diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a5a802c678e20..c10caaca0c186 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,11 +2,13 @@ name: CI on: push: - branches: [master] + branches: + - master + - 1.3.x pull_request: branches: - master - - 1.2.x + - 1.3.x env: ENV_FILE: environment.yml @@ -123,15 +125,15 @@ jobs: echo "${{ secrets.server_ssh_key }}" > ~/.ssh/id_rsa chmod 600 ~/.ssh/id_rsa echo "${{ secrets.server_ip }} ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBE1Kkopomm7FHG5enATf7SgnpICZ4W2bw+Ho+afqin+w7sMcrsa0je7sbztFAV8YchDkiBKnWTG4cRT+KZgZCaY=" > ~/.ssh/known_hosts - if: github.event_name == 'push' + if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}} - name: Upload web run: rsync -az --delete --exclude='pandas-docs' --exclude='docs' --exclude='Pandas_Cheat_Sheet*' web/build/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas - if: github.event_name == 'push' + if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}} - name: Upload dev docs run: rsync -az --delete doc/build/html/ docs@${{ secrets.server_ip }}:/usr/share/nginx/pandas/pandas-docs/dev - if: github.event_name == 'push' + if: ${{github.event_name == 'push' && github.ref == 'refs/heads/master'}} - name: Move docs into site directory run: mv doc/build/html web/build/docs @@ -163,6 +165,7 @@ jobs: PANDAS_DATA_MANAGER: array PATTERN: ${{ matrix.pattern }} PYTEST_WORKERS: "auto" + PYTEST_TARGET: pandas run: | source activate pandas-dev ci/run_tests.sh diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index 292598dfcab73..d9e78b197de7a 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -6,7 +6,7 @@ on: pull_request: branches: - master - - 1.2.x + - 1.3.x paths-ignore: - "doc/**" @@ -99,7 +99,7 @@ jobs: run: python ci/print_skipped.py - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index cb7d3fb5cabcf..677675dcd2231 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -2,11 +2,13 @@ name: Posix on: push: - branches: [master] + branches: + - master + - 1.3.x pull_request: branches: - master - - 1.2.x + - 1.3.x paths-ignore: - "doc/**" @@ -43,6 +45,7 @@ jobs: LC_ALL: ${{ matrix.settings[4] }} PANDAS_TESTING_MODE: ${{ matrix.settings[5] }} TEST_ARGS: ${{ matrix.settings[6] }} + PYTEST_TARGET: pandas steps: - name: Checkout @@ -90,7 +93,7 @@ jobs: run: python ci/print_skipped.py - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 723347913ac38..51b52105b483a 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -3,7 +3,9 @@ name: pre-commit on: pull_request: push: - branches: [master] + branches: + - master + - 1.3.x jobs: pre-commit: diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml index 38b1aa9ae7047..894328aee53e1 100644 --- a/.github/workflows/python-dev.yml +++ b/.github/workflows/python-dev.yml @@ -4,12 +4,21 @@ on: push: branches: - master + - 1.3.x pull_request: branches: - master + - 1.3.x paths-ignore: - "doc/**" +env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not 
clipboard" + COVERAGE: true + PYTEST_TARGET: pandas + jobs: build: runs-on: ubuntu-latest @@ -29,10 +38,10 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip setuptools wheel - pip install git+https://github.com/numpy/numpy.git + pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy pip install git+https://github.com/pytest-dev/pytest.git pip install git+https://github.com/nedbat/coveragepy.git - pip install cython python-dateutil pytz hypothesis pytest-xdist + pip install cython python-dateutil pytz hypothesis pytest-xdist pytest-cov pip list - name: Build Pandas @@ -46,7 +55,8 @@ jobs: - name: Test with pytest run: | - coverage run -m pytest -m 'not slow and not network and not clipboard' pandas + ci/run_tests.sh + # GH 41935 continue-on-error: true - name: Publish test results @@ -65,7 +75,7 @@ jobs: coverage report -m - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 + uses: codecov/codecov-action@v2 with: flags: unittests name: codecov-pandas diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml new file mode 100644 index 0000000000000..acb574f2ab8c5 --- /dev/null +++ b/.github/workflows/sdist.yml @@ -0,0 +1,64 @@ +name: sdist + +on: + push: + branches: + - master + - 1.3.x + pull_request: + branches: + - master + - 1.3.x + paths-ignore: + - "doc/**" + +jobs: + build: + runs-on: ubuntu-latest + timeout-minutes: 60 + defaults: + run: + shell: bash -l {0} + + strategy: + fail-fast: false + matrix: + python-version: ["3.7", "3.8", "3.9"] + + steps: + - uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip setuptools wheel + + # GH 39416 + pip install numpy + + - name: Build pandas sdist + run: | + pip list + python setup.py sdist --formats=gztar + + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-sdist + python-version: ${{ matrix.python-version }} + + - name: Install pandas from sdist + run: | + conda list + python -m pip install dist/*.gz + + - name: Import pandas + run: | + cd .. 
+ conda list + python -c "import pandas; pandas.show_versions();" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d580fcf4fc545..e2e6fe51b5f8b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -110,7 +110,7 @@ repos: entry: python scripts/generate_pip_deps_from_conda.py files: ^(environment.yml|requirements-dev.txt)$ pass_filenames: false - additional_dependencies: [pyyaml] + additional_dependencies: [pyyaml, toml] - id: sync-flake8-versions name: Check flake8 version is synced across flake8, yesqa, and environment.yml language: python diff --git a/MANIFEST.in b/MANIFEST.in index d0d93f2cdba8c..f616fad6b1557 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -17,18 +17,19 @@ global-exclude *.h5 global-exclude *.html global-exclude *.json global-exclude *.jsonl +global-exclude *.msgpack global-exclude *.pdf global-exclude *.pickle global-exclude *.png global-exclude *.pptx -global-exclude *.pyc -global-exclude *.pyd global-exclude *.ods global-exclude *.odt +global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so global-exclude *.xls +global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt @@ -39,6 +40,13 @@ global-exclude .DS_Store global-exclude .git* global-exclude \#* +global-exclude *.c +global-exclude *.cpp +global-exclude *.h + +global-exclude *.py[ocd] +global-exclude *.pxi + # GH 39321 # csv_dir_path fixture checks the existence of the directory # exclude the whole directory to avoid running related tests in sdist @@ -47,3 +55,6 @@ prune pandas/tests/io/parser/data include versioneer.py include pandas/_version.py include pandas/io/formats/templates/*.tpl + +graft pandas/_libs/src +graft pandas/_libs/tslibs/src diff --git a/asv_bench/benchmarks/algos/isin.py b/asv_bench/benchmarks/algos/isin.py index 296101c9f9800..5d7a76bc01d49 100644 --- a/asv_bench/benchmarks/algos/isin.py +++ b/asv_bench/benchmarks/algos/isin.py @@ -1,7 +1,5 @@ import numpy as np -from pandas.compat.numpy import np_version_under1p20 - from pandas import ( Categorical, NaT, @@ -280,10 +278,6 @@ class IsInLongSeriesLookUpDominates: def setup(self, dtype, MaxNumber, series_type): N = 10 ** 7 - # https://github.com/pandas-dev/pandas/issues/39844 - if not np_version_under1p20 and dtype in ("Int64", "Float64"): - raise NotImplementedError - if series_type == "random_hits": array = np.random.randint(0, MaxNumber, N) if series_type == "random_misses": @@ -294,7 +288,8 @@ def setup(self, dtype, MaxNumber, series_type): array = np.arange(N) + MaxNumber self.series = Series(array).astype(dtype) - self.values = np.arange(MaxNumber).astype(dtype) + + self.values = np.arange(MaxNumber).astype(dtype.lower()) def time_isin(self, dtypes, MaxNumber, series_type): self.series.isin(self.values) @@ -310,18 +305,24 @@ class IsInLongSeriesValuesDominate: def setup(self, dtype, series_type): N = 10 ** 7 - # https://github.com/pandas-dev/pandas/issues/39844 - if not np_version_under1p20 and dtype in ("Int64", "Float64"): - raise NotImplementedError - if series_type == "random": vals = np.random.randint(0, 10 * N, N) if series_type == "monotone": vals = np.arange(N) - self.values = vals.astype(dtype) + self.values = vals.astype(dtype.lower()) M = 10 ** 6 + 1 self.series = Series(np.arange(M)).astype(dtype) def time_isin(self, dtypes, series_type): self.series.isin(self.values) + + +class IsInWithLongTupples: + def setup(self): + t = tuple(range(1000)) + self.series = Series([t] * 1000) + self.values = [t] + + def 
time_isin(self): + self.series.isin(self.values) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index c32eda4928da7..e5834f311d259 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -232,6 +232,22 @@ def time_to_html_mixed(self): self.df2.to_html() +class ToDict: + params = [["dict", "list", "series", "split", "records", "index"]] + param_names = ["orient"] + + def setup(self, orient): + data = np.random.randint(0, 1000, size=(10000, 4)) + self.int_df = DataFrame(data) + self.datetimelike_df = self.int_df.astype("timedelta64[ns]") + + def time_to_dict_ints(self, orient): + self.int_df.to_dict(orient=orient) + + def time_to_dict_datetimelike(self, orient): + self.datetimelike_df.to_dict(orient=orient) + + class ToNumpy: def setup(self): N = 10000 diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 956feaef5f83e..7bda80f4e6024 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -3,29 +3,33 @@ trigger: branches: include: - master - - 1.2.x + - 1.3.x paths: exclude: - 'doc/*' pr: -- master -- 1.2.x + autoCancel: true + branches: + include: + - master + - 1.3.x variables: PYTEST_WORKERS: auto + PYTEST_TARGET: pandas jobs: # Mac and Linux use the same template - template: ci/azure/posix.yml parameters: name: macOS - vmImage: macOS-10.14 + vmImage: macOS-10.15 - template: ci/azure/windows.yml parameters: name: Windows - vmImage: vs2017-win2016 + vmImage: windows-2019 - job: py37_32bit pool: diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index 5644ad46714d5..baa9c0c857c23 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -8,15 +8,47 @@ jobs: vmImage: ${{ parameters.vmImage }} strategy: matrix: - py37_np17: + py37_np17_1: ENV_FILE: ci/deps/azure-windows-37.yaml CONDA_PY: "37" PATTERN: "not slow and not network" + PYTEST_WORKERS: 2 # GH-42236 + PYTEST_TARGET: "pandas/tests/[a-i]*" - py38_np18: + py37_np17_2: + ENV_FILE: ci/deps/azure-windows-37.yaml + CONDA_PY: "37" + PATTERN: "not slow and not network" + PYTEST_WORKERS: 2 # GH-42236 + PYTEST_TARGET: "pandas/tests/[j-z]*" + + py38_np18_1: + ENV_FILE: ci/deps/azure-windows-38.yaml + CONDA_PY: "38" + PATTERN: "not slow and not network" + PYTEST_WORKERS: 2 # GH-42236 + PYTEST_TARGET: "pandas/tests/[a-i]*" + + py38_np18_2: ENV_FILE: ci/deps/azure-windows-38.yaml CONDA_PY: "38" + PATTERN: "not slow and not network" + PYTEST_WORKERS: 2 # GH-42236 + PYTEST_TARGET: "pandas/tests/[j-z]*" + + py39_1: + ENV_FILE: ci/deps/azure-windows-39.yaml + CONDA_PY: "39" + PATTERN: "not slow and not network and not high_memory" + PYTEST_WORKERS: 2 # GH-42236 + PYTEST_TARGET: "pandas/tests/[a-i]*" + + py39_2: + ENV_FILE: ci/deps/azure-windows-39.yaml + CONDA_PY: "39" PATTERN: "not slow and not network and not high_memory" + PYTEST_WORKERS: 2 # GH-42236 + PYTEST_TARGET: "pandas/tests/[j-z]*" steps: - powershell: | @@ -40,6 +72,7 @@ jobs: - bash: | source activate pandas-dev + wmic.exe cpu get caption, deviceid, name, numberofcores, maxclockspeed ci/run_tests.sh displayName: 'Test' diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index e568f8615a8df..9d680cb8338fd 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -16,7 +16,7 @@ dependencies: - botocore>=1.11 - dask - fastparquet>=0.4.0 - - fsspec>=0.7.4 + - fsspec>=0.7.4, <2021.6.0 - gcsfs>=0.6.0 - geopandas - html5lib @@ -25,7 +25,7 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.17.* + - numpy=1.18.* - odfpy - openpyxl - pandas-gbq diff 
--git a/ci/deps/actions-37-slow.yaml b/ci/deps/actions-37-slow.yaml index 166f2237dcad3..76eb7ba5693e9 100644 --- a/ci/deps/actions-37-slow.yaml +++ b/ci/deps/actions-37-slow.yaml @@ -14,7 +14,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - - fsspec>=0.7.4 + - fsspec>=0.7.4, <2021.6.0 - html5lib - lxml - matplotlib diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index 0effe6f80df86..2272f8470e209 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -14,7 +14,7 @@ dependencies: # pandas dependencies - botocore>=1.11 - - fsspec>=0.7.4 + - fsspec>=0.7.4, <2021.6.0 - numpy=1.19 - python-dateutil - nomkl diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml index 34a6860936550..dfed0df77a327 100644 --- a/ci/deps/actions-38-locale.yaml +++ b/ci/deps/actions-38-locale.yaml @@ -18,7 +18,7 @@ dependencies: - html5lib - ipython - jinja2 - - jedi<0.18.0 + - jedi - lxml - matplotlib<3.3.0 - moto diff --git a/ci/deps/actions-38-numpydev.yaml b/ci/deps/actions-38-numpydev.yaml index 6eed2daac0c3b..e943053f15600 100644 --- a/ci/deps/actions-38-numpydev.yaml +++ b/ci/deps/actions-38-numpydev.yaml @@ -11,11 +11,11 @@ dependencies: - hypothesis>=3.58.0 # pandas dependencies + - python-dateutil - pytz - pip - pip: - cython==0.29.21 # GH#34014 - - "git+git://github.com/dateutil/dateutil.git" - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - "--pre" - "numpy" diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml index afba60e451b90..c464b30e02203 100644 --- a/ci/deps/actions-38-slow.yaml +++ b/ci/deps/actions-38-slow.yaml @@ -13,7 +13,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - - fsspec>=0.7.4 + - fsspec>=0.7.4, <2021.6.0 - html5lib - lxml - matplotlib diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index b74f1af8ee0f6..03f2bc84bcc01 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -12,11 +12,30 @@ dependencies: - hypothesis>=3.58.0 # pandas dependencies + - beautifulsoup4 + - bottleneck + - fsspec>=0.8.0, <2021.6.0 + - gcsfs + - html5lib + - jinja2 + - lxml + - matplotlib + - moto>=1.3.14 + - flask + - numexpr - numpy + - openpyxl + - pyarrow + - pytables - python-dateutil - pytz - - # optional dependencies - - pytables + - s3fs>=0.4.2 - scipy - - pyarrow=1.0 + - sqlalchemy + - xlrd + - xlsxwriter + - xlwt + - pyreadstat + - pip + - pip: + - pyxlsb diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5cbc029f8c03d..4df55813ea21c 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - bottleneck - - fsspec>=0.8.0 + - fsspec>=0.8.0, <2021.6.0 - gcsfs>=0.6.0 - html5lib - jinja2 diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 7fdecae626f9d..70aa46e8a5851 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -17,7 +17,7 @@ dependencies: - bottleneck - fastparquet>=0.4.0 - flask - - fsspec>=0.8.0 + - fsspec>=0.8.0, <2021.6.0 - matplotlib=3.1.3 - moto>=1.3.14 - numba diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 0d6f26d8c29f8..ed1c5dc873092 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -19,7 +19,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE pandas" +PYTEST_CMD="${XVFB}pytest -m \"$PATTERN\" -n $PYTEST_WORKERS 
--dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ $(uname) != "Linux" && $(uname) != "Darwin" ]]; then # GH#37455 windows py38 build appears to be running out of memory diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index bc0a3556b9ac1..7b52bf760b601 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -189,11 +189,8 @@ Creating a Python environment (pip) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ If you aren't using conda for your development environment, follow these instructions. -You'll need to have at least the :ref:`minimum Python version ` that pandas supports. If your Python version -is 3.8.0 (or later), you might need to update your ``setuptools`` to version 42.0.0 (or later) -in your development environment before installing the build dependencies:: - - pip install --upgrade setuptools +You'll need to have at least the :ref:`minimum Python version ` that pandas supports. +You also need to have ``setuptools`` 51.0.0 or later to build pandas. **Unix**/**macOS with virtualenv** diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index ee061e7b7d3e6..e58779c090d8f 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -445,6 +445,12 @@ provides a familiar ``DataFrame`` interface for out-of-core, parallel and distri Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow. +`Ibis `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Ibis offers a standard way to write analytics code, that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.). + + `Koalas `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/reference/io.rst b/doc/source/reference/io.rst index 442631de50c7a..82d4ec4950ef1 100644 --- a/doc/source/reference/io.rst +++ b/doc/source/reference/io.rst @@ -13,6 +13,7 @@ Pickling :toctree: api/ read_pickle + DataFrame.to_pickle Flat file ~~~~~~~~~ @@ -21,6 +22,7 @@ Flat file read_table read_csv + DataFrame.to_csv read_fwf Clipboard @@ -29,6 +31,7 @@ Clipboard :toctree: api/ read_clipboard + DataFrame.to_clipboard Excel ~~~~~ @@ -36,23 +39,33 @@ Excel :toctree: api/ read_excel + DataFrame.to_excel ExcelFile.parse +.. currentmodule:: pandas.io.formats.style + +.. autosummary:: + :toctree: api/ + + Styler.to_excel + +.. currentmodule:: pandas + .. autosummary:: :toctree: api/ :template: autosummary/class_without_autosummary.rst ExcelWriter +.. currentmodule:: pandas.io.json + JSON ~~~~ .. autosummary:: :toctree: api/ read_json - json_normalize - -.. currentmodule:: pandas.io.json + to_json .. autosummary:: :toctree: api/ @@ -67,6 +80,16 @@ HTML :toctree: api/ read_html + DataFrame.to_html + +.. currentmodule:: pandas.io.formats.style + +.. autosummary:: + :toctree: api/ + + Styler.to_html + +.. currentmodule:: pandas XML ~~~~ @@ -74,6 +97,23 @@ XML :toctree: api/ read_xml + DataFrame.to_xml + +Latex +~~~~~ +.. autosummary:: + :toctree: api/ + + DataFrame.to_latex + +.. currentmodule:: pandas.io.formats.style + +.. autosummary:: + :toctree: api/ + + Styler.to_latex + +.. 
currentmodule:: pandas HDFStore: PyTables (HDF5) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -101,6 +141,7 @@ Feather :toctree: api/ read_feather + DataFrame.to_feather Parquet ~~~~~~~ @@ -108,6 +149,7 @@ Parquet :toctree: api/ read_parquet + DataFrame.to_parquet ORC ~~~ @@ -138,6 +180,7 @@ SQL read_sql_table read_sql_query read_sql + DataFrame.to_sql Google BigQuery ~~~~~~~~~~~~~~~ @@ -152,6 +195,7 @@ STATA :toctree: api/ read_stata + DataFrame.to_stata .. currentmodule:: pandas.io.stata diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 5a2ff803f0323..7b790daea37ff 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -24,6 +24,8 @@ Styler properties Styler.env Styler.template_html + Styler.template_html_style + Styler.template_html_table Styler.template_latex Styler.loader @@ -34,13 +36,15 @@ Style application Styler.apply Styler.applymap - Styler.where Styler.format + Styler.hide_index + Styler.hide_columns Styler.set_td_classes Styler.set_table_styles Styler.set_table_attributes Styler.set_tooltips Styler.set_caption + Styler.set_sticky Styler.set_properties Styler.set_uuid Styler.clear diff --git a/doc/source/user_guide/boolean.rst b/doc/source/user_guide/boolean.rst index 76c922fcef638..54c67674b890c 100644 --- a/doc/source/user_guide/boolean.rst +++ b/doc/source/user_guide/boolean.rst @@ -12,6 +12,11 @@ Nullable Boolean data type ************************** +.. note:: + + BooleanArray is currently experimental. Its API or implementation may + change without warning. + .. versionadded:: 1.0.0 diff --git a/doc/source/user_guide/enhancingperf.rst b/doc/source/user_guide/enhancingperf.rst index aa9a1ba6d6bf0..c78d972f33d65 100644 --- a/doc/source/user_guide/enhancingperf.rst +++ b/doc/source/user_guide/enhancingperf.rst @@ -302,28 +302,63 @@ For more about ``boundscheck`` and ``wraparound``, see the Cython docs on .. _enhancingperf.numba: -Using Numba ------------ +Numba (JIT compilation) +----------------------- -A recent alternative to statically compiling Cython code, is to use a *dynamic jit-compiler*, Numba. +An alternative to statically compiling Cython code is to use a dynamic just-in-time (JIT) compiler with `Numba `__. -Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters. +Numba allows you to write a pure Python function which can be JIT compiled to native machine instructions, similar in performance to C, C++ and Fortran, +by decorating your function with ``@jit``. -Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack. +Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). +Numba supports compilation of Python to run on either CPU or GPU hardware and is designed to integrate with the Python scientific software stack. .. note:: - You will need to install Numba. This is easy with ``conda``, by using: ``conda install numba``, see :ref:`installing using miniconda`. 
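As a minimal sketch of the decorator route this section describes (not part of the patch; it assumes Numba and NumPy are installed, and ``double_every_value`` is just an illustrative name):

.. code-block:: python

    import numba
    import numpy as np
    import pandas as pd

    @numba.jit(nopython=True)
    def double_every_value(arr):
        # work on a plain NumPy array so Numba can stay in nopython mode
        result = np.empty(arr.shape[0])
        for i in range(arr.shape[0]):
            result[i] = arr[i] * 2.0
        return result

    s = pd.Series(range(1_000_000))
    # pass the underlying NumPy array, not the pandas object itself
    doubled = double_every_value(s.to_numpy())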
+ The ``@jit`` compilation will add overhead to the runtime of the function, so performance benefits may not be realized especially when using small data sets. + Consider `caching `__ your function to avoid compilation overhead each time your function is run. -.. note:: +Numba can be used in 2 ways with pandas: + +#. Specify the ``engine="numba"`` keyword in select pandas methods +#. Define your own Python function decorated with ``@jit`` and pass the underlying NumPy array of :class:`Series` or :class:`Dataframe` (using ``to_numpy()``) into the function + +pandas Numba Engine +~~~~~~~~~~~~~~~~~~~ + +If Numba is installed, one can specify ``engine="numba"`` in select pandas methods to execute the method using Numba. +Methods that support ``engine="numba"`` will also have an ``engine_kwargs`` keyword that accepts a dictionary that allows one to specify +``"nogil"``, ``"nopython"`` and ``"parallel"`` keys with boolean values to pass into the ``@jit`` decorator. +If ``engine_kwargs`` is not specified, it defaults to ``{"nogil": False, "nopython": True, "parallel": False}`` unless otherwise specified. + +In terms of performance, **the first time a function is run using the Numba engine will be slow** +as Numba will have some function compilation overhead. However, the JIT compiled functions are cached, +and subsequent calls will be fast. In general, the Numba engine is performant with +a larger amount of data points (e.g. 1+ million). - As of Numba version 0.20, pandas objects cannot be passed directly to Numba-compiled functions. Instead, one must pass the NumPy array underlying the pandas object to the Numba-compiled function as demonstrated below. +.. code-block:: ipython + + In [1]: data = pd.Series(range(1_000_000)) # noqa: E225 + + In [2]: roll = data.rolling(10) -Jit -~~~ + In [3]: def f(x): + ...: return np.sum(x) + 5 + # Run the first time, compilation time will affect performance + In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) + 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) + # Function is cached and performance will improve + In [5]: %timeit roll.apply(f, engine='numba', raw=True) + 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) -We demonstrate how to use Numba to just-in-time compile our code. We simply -take the plain Python code from above and annotate with the ``@jit`` decorator. + In [6]: %timeit roll.apply(f, engine='cython', raw=True) + 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + +Custom Function Examples +~~~~~~~~~~~~~~~~~~~~~~~~ + +A custom Python function decorated with ``@jit`` can be used with pandas objects by passing their NumPy array +representations with ``to_numpy()``. .. code-block:: python @@ -360,8 +395,6 @@ take the plain Python code from above and annotate with the ``@jit`` decorator. ) return pd.Series(result, index=df.index, name="result") -Note that we directly pass NumPy arrays to the Numba function. ``compute_numba`` is just a wrapper that provides a -nicer interface by passing/returning pandas objects. .. code-block:: ipython @@ -370,19 +403,9 @@ nicer interface by passing/returning pandas objects. In this example, using Numba was faster than Cython. -Numba as an argument -~~~~~~~~~~~~~~~~~~~~ - -Additionally, we can leverage the power of `Numba `__ -by calling it as an argument in :meth:`~Rolling.apply`. See :ref:`Computation tools -` for an extensive example. 
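A short, hedged sketch of the ``engine="numba"`` keyword route that the new "pandas Numba Engine" subsection describes (illustrative only; assumes Numba is installed, and the ``engine_kwargs`` shown are simply the documented defaults):

.. code-block:: python

    import numpy as np
    import pandas as pd

    def f(x):
        # plain Python/NumPy code; the Numba engine JIT-compiles it
        return np.sum(x) + 5

    data = pd.Series(range(1_000_000))
    roll = data.rolling(10)

    result = roll.apply(
        f,
        raw=True,
        engine="numba",
        engine_kwargs={"nopython": True, "nogil": False, "parallel": False},
    )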
- -Vectorize -~~~~~~~~~ - Numba can also be used to write vectorized functions that do not require the user to explicitly loop over the observations of a vector; a vectorized function will be applied to each row automatically. -Consider the following toy example of doubling each observation: +Consider the following example of doubling each observation: .. code-block:: python @@ -414,25 +437,23 @@ Consider the following toy example of doubling each observation: Caveats ~~~~~~~ -.. note:: - - Numba will execute on any function, but can only accelerate certain classes of functions. - Numba is best at accelerating functions that apply numerical functions to NumPy -arrays. When passed a function that only uses operations it knows how to -accelerate, it will execute in ``nopython`` mode. - -If Numba is passed a function that includes something it doesn't know how to -work with -- a category that currently includes sets, lists, dictionaries, or -string functions -- it will revert to ``object mode``. In ``object mode``, -Numba will execute but your code will not speed up significantly. If you would +arrays. If you try to ``@jit`` a function that contains unsupported `Python `__ +or `NumPy `__ +code, compilation will revert `object mode `__ which +will mostly likely not speed up your function. If you would prefer that Numba throw an error if it cannot compile a function in a way that speeds up your code, pass Numba the argument -``nopython=True`` (e.g. ``@numba.jit(nopython=True)``). For more on +``nopython=True`` (e.g. ``@jit(nopython=True)``). For more on troubleshooting Numba modes, see the `Numba troubleshooting page `__. -Read more in the `Numba docs `__. +Using ``parallel=True`` (e.g. ``@jit(parallel=True)``) may result in a ``SIGABRT`` if the threading layer leads to unsafe +behavior. You can first `specify a safe threading layer `__ +before running a JIT function with ``parallel=True``. + +Generally if the you encounter a segfault (``SIGSEGV``) while using Numba, please report the issue +to the `Numba issue tracker. `__ .. _enhancingperf.eval: diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index 870ec6763c72f..90d8ec4f95727 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1106,11 +1106,9 @@ Numba Accelerated Routines .. versionadded:: 1.1 If `Numba `__ is installed as an optional dependency, the ``transform`` and -``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. The ``engine_kwargs`` -argument is a dictionary of keyword arguments that will be passed into the -`numba.jit decorator `__. -These keyword arguments will be applied to the passed function. Currently only ``nogil``, ``nopython``, -and ``parallel`` are supported, and their default values are set to ``False``, ``True`` and ``False`` respectively. +``aggregate`` methods support ``engine='numba'`` and ``engine_kwargs`` arguments. +See :ref:`enhancing performance with Numba ` for general usage of the arguments +and performance considerations. The function signature must start with ``values, index`` **exactly** as the data belonging to each group will be passed into ``values``, and the group index will be passed into ``index``. @@ -1121,52 +1119,6 @@ will be passed into ``values``, and the group index will be passed into ``index` data and group index will be passed as NumPy arrays to the JITed user defined function, and no alternative execution attempts will be tried. -.. 
note:: - - In terms of performance, **the first time a function is run using the Numba engine will be slow** - as Numba will have some function compilation overhead. However, the compiled functions are cached, - and subsequent calls will be fast. In general, the Numba engine is performant with - a larger amount of data points (e.g. 1+ million). - -.. code-block:: ipython - - In [1]: N = 10 ** 3 - - In [2]: data = {0: [str(i) for i in range(100)] * N, 1: list(range(100)) * N} - - In [3]: df = pd.DataFrame(data, columns=[0, 1]) - - In [4]: def f_numba(values, index): - ...: total = 0 - ...: for i, value in enumerate(values): - ...: if i % 2: - ...: total += value + 5 - ...: else: - ...: total += value * 2 - ...: return total - ...: - - In [5]: def f_cython(values): - ...: total = 0 - ...: for i, value in enumerate(values): - ...: if i % 2: - ...: total += value + 5 - ...: else: - ...: total += value * 2 - ...: return total - ...: - - In [6]: groupby = df.groupby(0) - # Run the first time, compilation time will affect performance - In [7]: %timeit -r 1 -n 1 groupby.aggregate(f_numba, engine='numba') # noqa: E225 - 2.14 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) - # Function is cached and performance will improve - In [8]: %timeit groupby.aggregate(f_numba, engine='numba') - 4.93 ms ± 32.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - In [9]: %timeit groupby.aggregate(f_cython, engine='cython') - 18.6 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - Other useful features --------------------- diff --git a/doc/source/user_guide/indexing.rst b/doc/source/user_guide/indexing.rst index dc66303a44f53..1ab26cf758b77 100644 --- a/doc/source/user_guide/indexing.rst +++ b/doc/source/user_guide/indexing.rst @@ -1523,8 +1523,8 @@ Looking up values by index/column labels ---------------------------------------- Sometimes you want to extract a set of values given a sequence of row labels -and column labels, this can be achieved by ``DataFrame.melt`` combined by filtering the corresponding -rows with ``DataFrame.loc``. For instance: +and column labels, this can be achieved by ``pandas.factorize`` and NumPy indexing. +For instance: .. ipython:: python @@ -1532,9 +1532,8 @@ rows with ``DataFrame.loc``. For instance: 'A': [80, 23, np.nan, 22], 'B': [80, 55, 76, 67]}) df - melt = df.melt('col') - melt = melt.loc[melt['col'] == melt['variable'], 'value'] - melt.reset_index(drop=True) + idx, cols = pd.factorize(df['col']) + df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx] Formerly this could be achieved with the dedicated ``DataFrame.lookup`` method which was deprecated in version 1.2.0. diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index c2b030d732ba9..1b28aa2900f65 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -5526,13 +5526,23 @@ below and the SQLAlchemy `documentation `__ +for an explanation of how the database connection is handled. .. code-block:: python with engine.connect() as conn, conn.begin(): data = pd.read_sql_table("data", conn) +.. warning:: + + When you open a connection to a database you are also responsible for closing it. + Side effects of leaving a connection open may include locking the database or + other breaking behaviour. 
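For convenience, the ``factorize``-based lookup recipe from the indexing.rst hunk above can be run end to end as follows (an illustrative sketch, not part of the patch):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {
            "col": ["A", "A", "B", "B"],
            "A": [80, 23, np.nan, 22],
            "B": [80, 55, 76, 67],
        }
    )

    # factorize the lookup labels, then pick one value per row with NumPy indexing
    idx, cols = pd.factorize(df["col"])
    values = df.reindex(cols, axis=1).to_numpy()[np.arange(len(df)), idx]
    # array([80., 23., 76., 67.])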
+ Writing DataFrames '''''''''''''''''' diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 7d8d8e90dfbda..cc499204318c1 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -152,7 +152,7 @@ "\n", "Before adding styles it is useful to show that the [Styler][styler] can distinguish the *display* value from the *actual* value. To control the display value, the text is printed in each cell, and we can use the [.format()][formatfunc] method to manipulate this according to a [format spec string][format] or a callable that takes a single value and returns a string. It is possible to define this for the whole table or for individual columns. \n", "\n", - "Additionally, the format function has a **precision** argument to specifically help formatting floats, an **na_rep** argument to display missing data, and an **escape** argument to help displaying safe-HTML. The default formatter is configured to adopt pandas' regular `display.precision` option, controllable using `with pd.option_context('display.precision', 2):`\n", + "Additionally, the format function has a **precision** argument to specifically help formatting floats, as well as **decimal** and **thousands** separators to support other locales, an **na_rep** argument to display missing data, and an **escape** argument to help displaying safe-HTML or safe-LaTeX. The default formatter is configured to adopt pandas' regular `display.precision` option, controllable using `with pd.option_context('display.precision', 2):`\n", "\n", "Here is an example of using the multiple options to control the formatting generally and with specific column formatters.\n", "\n", @@ -167,9 +167,9 @@ "metadata": {}, "outputs": [], "source": [ - "df.style.format(precision=0, na_rep='MISSING', \n", + "df.style.format(precision=0, na_rep='MISSING', thousands=\" \",\n", " formatter={('Decision Tree', 'Tumour'): \"{:.2f}\",\n", - " ('Regression', 'Non-Tumour'): lambda x: \"$ {:,.1f}\".format(x*-1e3)\n", + " ('Regression', 'Non-Tumour'): lambda x: \"$ {:,.1f}\".format(x*-1e6)\n", " })" ] }, @@ -179,9 +179,11 @@ "source": [ "### Hiding Data\n", "\n", - "The index can be hidden from rendering by calling [.hide_index()][hideidx], which might be useful if your index is integer based.\n", + "The index and column headers can be completely hidden, as well subselecting rows or columns that one wishes to exclude. Both these options are performed using the same methods.\n", "\n", - "Columns can be hidden from rendering by calling [.hide_columns()][hidecols] and passing in the name of a column, or a slice of columns.\n", + "The index can be hidden from rendering by calling [.hide_index()][hideidx] without any arguments, which might be useful if your index is integer based. Similarly column headers can be hidden by calling [.hide_columns()][hidecols] without any arguments.\n", + "\n", + "Specific rows or columns can be hidden from rendering by calling the same [.hide_index()][hideidx] or [.hide_columns()][hidecols] methods and passing in a row/column label, a list-like or a slice of row/column labels to for the ``subset`` argument.\n", "\n", "Hiding does not change the integer arrangement of CSS classes, e.g. 
hiding the first two columns of a DataFrame means the column class indexing will start at `col2`, since `col0` and `col1` are simply ignored.\n", "\n", @@ -1403,7 +1405,9 @@ "source": [ "### Sticky Headers\n", "\n", - "If you display a large matrix or DataFrame in a notebook, but you want to always see the column and row headers you can use the following CSS to make them stick. We might make this into an API function later." + "If you display a large matrix or DataFrame in a notebook, but you want to always see the column and row headers you can use the [.set_sticky][sticky] method which manipulates the table styles CSS.\n", + "\n", + "[sticky]: ../reference/api/pandas.io.formats.style.Styler.set_sticky.rst" ] }, { @@ -1412,20 +1416,15 @@ "metadata": {}, "outputs": [], "source": [ - "bigdf = pd.DataFrame(np.random.randn(15, 100))\n", - "bigdf.style.set_table_styles([\n", - " {'selector': 'thead th', 'props': 'position: sticky; top:0; background-color:salmon;'},\n", - " {'selector': 'tbody th', 'props': 'position: sticky; left:0; background-color:lightgreen;'} \n", - "])" + "bigdf = pd.DataFrame(np.random.randn(16, 100))\n", + "bigdf.style.set_sticky(axis=\"index\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Hiding Headers\n", - "\n", - "We don't yet have any API to hide headers so a quick fix is:" + "It is also possible to stick MultiIndexes and even only specific levels." ] }, { @@ -1434,7 +1433,8 @@ "metadata": {}, "outputs": [], "source": [ - "df3.style.set_table_styles([{'selector': 'thead tr', 'props': 'display: none;'}]) # or 'thead th'" + "bigdf.index = pd.MultiIndex.from_product([[\"A\",\"B\"],[0,1],[0,1,2,3]])\n", + "bigdf.style.set_sticky(axis=\"index\", pixel_size=18, levels=[1,2])" ] }, { @@ -1524,6 +1524,17 @@ "![Excel spreadsheet with styled DataFrame](../_static/style-excel.png)\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Export to LaTeX\n", + "\n", + "There is support (*since version 1.3.0*) to export `Styler` to LaTeX. 
The documentation for the [.to_latex][latex] method gives further detail and numerous examples.\n", + "\n", + "[latex]: ../reference/api/pandas.io.formats.style.Styler.to_latex.rst" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1769,7 +1780,7 @@ " Styler.loader, # the default\n", " ])\n", " )\n", - " template_html = env.get_template(\"myhtml.tpl\")" + " template_html_table = env.get_template(\"myhtml.tpl\")" ] }, { @@ -1822,14 +1833,35 @@ "outputs": [], "source": [ "EasyStyler = Styler.from_custom_template(\"templates\", \"myhtml.tpl\")\n", - "EasyStyler(df3)" + "HTML(EasyStyler(df3).render(table_title=\"Another Title\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Here's the template structure:" + "#### Template Structure\n", + "\n", + "Here's the template structure for the both the style generation template and the table generation template:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Style template:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "with open(\"templates/html_style_structure.html\") as f:\n", + " style_structure = f.read()" ] }, { @@ -1838,10 +1870,35 @@ "metadata": {}, "outputs": [], "source": [ - "with open(\"templates/template_structure.html\") as f:\n", - " structure = f.read()\n", - " \n", - "HTML(structure)" + "HTML(style_structure)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Table template:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "nbsphinx": "hidden" + }, + "outputs": [], + "source": [ + "with open(\"templates/html_table_structure.html\") as f:\n", + " table_structure = f.read()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "HTML(table_structure)" ] }, { diff --git a/doc/source/user_guide/templates/html_style_structure.html b/doc/source/user_guide/templates/html_style_structure.html new file mode 100644 index 0000000000000..dc0c03ac363a9 --- /dev/null +++ b/doc/source/user_guide/templates/html_style_structure.html @@ -0,0 +1,35 @@ + + + +
+before_style
+style
+<style type="text/css">
+table_styles
+before_cellstyle
+cellstyle
+</style>
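The Styler-related hunks above center on a handful of 1.3.0 additions (``set_sticky``, the ``subset`` argument to ``hide_columns``, ``to_latex``); a brief sketch of how they compose, assuming pandas >= 1.3.0 (illustrative, not part of the patch):

.. code-block:: python

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])

    styler = (
        df.style.format(precision=2, na_rep="MISSING", thousands=" ")
        .hide_columns(subset=["D"])   # hide one column via the new ``subset`` argument
        .set_sticky(axis="index")     # keep row headers visible while scrolling
    )

    html = styler.render()        # HTML output
    latex = df.style.to_latex()   # LaTeX export added in 1.3.0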
diff --git a/doc/source/user_guide/templates/template_structure.html b/doc/source/user_guide/templates/html_table_structure.html similarity index 80% rename from doc/source/user_guide/templates/template_structure.html rename to doc/source/user_guide/templates/html_table_structure.html index 0778d8e2e6f18..e03f9591d2a35 100644 --- a/doc/source/user_guide/templates/template_structure.html +++ b/doc/source/user_guide/templates/html_table_structure.html @@ -25,15 +25,6 @@ } -
-before_style
-style
-<style type="text/css">
-table_styles
-before_cellstyle
-cellstyle
-</style>
-
-
 before_table
table diff --git a/doc/source/user_guide/templates/myhtml.tpl b/doc/source/user_guide/templates/myhtml.tpl index 1170fd3def653..1e204d0bd4568 100644 --- a/doc/source/user_guide/templates/myhtml.tpl +++ b/doc/source/user_guide/templates/myhtml.tpl @@ -1,4 +1,4 @@ -{% extends "html.tpl" %} +{% extends "html_table.tpl" %} {% block table %}

{{ table_title|default("My Table") }}

{{ super() }} diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index 0d6dcaa3726e6..3e533cbadc5f7 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -262,26 +262,24 @@ and we want to use an expanding window where ``use_expanding`` is ``True`` other .. code-block:: ipython In [2]: from pandas.api.indexers import BaseIndexer - ...: - ...: class CustomIndexer(BaseIndexer): - ...: - ...: def get_window_bounds(self, num_values, min_periods, center, closed): - ...: start = np.empty(num_values, dtype=np.int64) - ...: end = np.empty(num_values, dtype=np.int64) - ...: for i in range(num_values): - ...: if self.use_expanding[i]: - ...: start[i] = 0 - ...: end[i] = i + 1 - ...: else: - ...: start[i] = i - ...: end[i] = i + self.window_size - ...: return start, end - ...: - - In [3]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) - - In [4]: df.rolling(indexer).sum() - Out[4]: + + In [3]: class CustomIndexer(BaseIndexer): + ...: def get_window_bounds(self, num_values, min_periods, center, closed): + ...: start = np.empty(num_values, dtype=np.int64) + ...: end = np.empty(num_values, dtype=np.int64) + ...: for i in range(num_values): + ...: if self.use_expanding[i]: + ...: start[i] = 0 + ...: end[i] = i + 1 + ...: else: + ...: start[i] = i + ...: end[i] = i + self.window_size + ...: return start, end + + In [4]: indexer = CustomIndexer(window_size=1, use_expanding=use_expanding) + + In [5]: df.rolling(indexer).sum() + Out[5]: values 0 0.0 1 1.0 @@ -365,45 +363,21 @@ Numba engine Additionally, :meth:`~Rolling.apply` can leverage `Numba `__ if installed as an optional dependency. The apply aggregation can be executed using Numba by specifying ``engine='numba'`` and ``engine_kwargs`` arguments (``raw`` must also be set to ``True``). +See :ref:`enhancing performance with Numba ` for general usage of the arguments and performance considerations. + Numba will be applied in potentially two routines: #. If ``func`` is a standard Python function, the engine will `JIT `__ the passed function. ``func`` can also be a JITed function in which case the engine will not JIT the function again. #. The engine will JIT the for loop where the apply function is applied to each window. -.. versionadded:: 1.3.0 - -``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments. - The ``engine_kwargs`` argument is a dictionary of keyword arguments that will be passed into the `numba.jit decorator `__. These keyword arguments will be applied to *both* the passed function (if a standard Python function) -and the apply for loop over each window. Currently only ``nogil``, ``nopython``, and ``parallel`` are supported, -and their default values are set to ``False``, ``True`` and ``False`` respectively. - -.. note:: +and the apply for loop over each window. - In terms of performance, **the first time a function is run using the Numba engine will be slow** - as Numba will have some function compilation overhead. However, the compiled functions are cached, - and subsequent calls will be fast. In general, the Numba engine is performant with - a larger amount of data points (e.g. 1+ million). - -.. 
code-block:: ipython - - In [1]: data = pd.Series(range(1_000_000)) - - In [2]: roll = data.rolling(10) - - In [3]: def f(x): - ...: return np.sum(x) + 5 - # Run the first time, compilation time will affect performance - In [4]: %timeit -r 1 -n 1 roll.apply(f, engine='numba', raw=True) # noqa: E225, E999 - 1.23 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each) - # Function is cached and performance will improve - In [5]: %timeit roll.apply(f, engine='numba', raw=True) - 188 ms ± 1.93 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) +.. versionadded:: 1.3.0 - In [6]: %timeit roll.apply(f, engine='cython', raw=True) - 3.92 s ± 59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) +``mean``, ``median``, ``max``, ``min``, and ``sum`` also support the ``engine`` and ``engine_kwargs`` arguments. .. _window.cov_corr: diff --git a/doc/source/whatsnew/index.rst b/doc/source/whatsnew/index.rst index 986cf43b80494..ee2f44bd9e981 100644 --- a/doc/source/whatsnew/index.rst +++ b/doc/source/whatsnew/index.rst @@ -16,6 +16,9 @@ Version 1.3 .. toctree:: :maxdepth: 2 + v1.3.3 + v1.3.2 + v1.3.1 v1.3.0 Version 1.2 diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index b87274307431b..03dfe475475a1 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -338,19 +338,20 @@ maps labels to their new names along the default axis, is allowed to be passed b *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df = pd.DataFrame([[1]]) - >>> df.rename({0: 1}, {0: 2}) + In [1]: df = pd.DataFrame([[1]]) + In [2]: df.rename({0: 1}, {0: 2}) + Out[2]: FutureWarning: ...Use named arguments to resolve ambiguity... 2 1 1 *pandas 1.0.0* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, {0: 2}) + In [3]: df.rename({0: 1}, {0: 2}) Traceback (most recent call last): ... TypeError: rename() takes from 1 to 2 positional arguments but 3 were given @@ -359,26 +360,28 @@ Note that errors will now be raised when conflicting or potentially ambiguous ar *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, index={0: 2}) + In [4]: df.rename({0: 1}, index={0: 2}) + Out[4]: 0 1 1 - >>> df.rename(mapper={0: 1}, index={0: 2}) + In [5]: df.rename(mapper={0: 1}, index={0: 2}) + Out[5]: 0 2 1 *pandas 1.0.0* -.. code-block:: python +.. code-block:: ipython - >>> df.rename({0: 1}, index={0: 2}) + In [6]: df.rename({0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' - >>> df.rename(mapper={0: 1}, index={0: 2}) + In [7]: df.rename(mapper={0: 1}, index={0: 2}) Traceback (most recent call last): ... TypeError: Cannot specify both 'mapper' and any of 'index' or 'columns' @@ -405,12 +408,12 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df = pd.DataFrame({"int_col": [1, 2, 3], + In [1]: df = pd.DataFrame({"int_col": [1, 2, 3], ... "text_col": ["a", "b", "c"], ... "float_col": [0.0, 0.1, 0.2]}) - >>> df.info(verbose=True) + In [2]: df.info(verbose=True) RangeIndex: 3 entries, 0 to 2 Data columns (total 3 columns): @@ -440,14 +443,16 @@ Extended verbose info output for :class:`~pandas.DataFrame` *pandas 0.25.x* -.. code-block:: python +.. 
code-block:: ipython - >>> pd.array(["a", None]) + In [1]: pd.array(["a", None]) + Out[1]: ['a', None] Length: 2, dtype: object - >>> pd.array([1, None]) + In [2]: pd.array([1, None]) + Out[2]: [1, None] Length: 2, dtype: object @@ -470,15 +475,17 @@ As a reminder, you can specify the ``dtype`` to disable all inference. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> a = pd.array([1, 2, None], dtype="Int64") - >>> a + In [1]: a = pd.array([1, 2, None], dtype="Int64") + In [2]: a + Out[2]: [1, 2, NaN] Length: 3, dtype: Int64 - >>> a[2] + In [3]: a[2] + Out[3]: nan *pandas 1.0.0* @@ -499,9 +506,10 @@ will now raise. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> np.asarray(a, dtype="float") + In [1]: np.asarray(a, dtype="float") + Out[1]: array([ 1., 2., nan]) *pandas 1.0.0* @@ -525,9 +533,10 @@ will now be ``pd.NA`` instead of ``np.nan`` in presence of missing values *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.Series(a).sum(skipna=False) + In [1]: pd.Series(a).sum(skipna=False) + Out[1]: nan *pandas 1.0.0* @@ -543,9 +552,10 @@ integer dtype for the values. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + In [1]: pd.Series([2, 1, 1, None], dtype="Int64").value_counts().dtype + Out[1]: dtype('int64') *pandas 1.0.0* @@ -565,15 +575,17 @@ Comparison operations on a :class:`arrays.IntegerArray` now returns a *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> a = pd.array([1, 2, None], dtype="Int64") - >>> a + In [1]: a = pd.array([1, 2, None], dtype="Int64") + In [2]: a + Out[2]: [1, 2, NaN] Length: 3, dtype: Int64 - >>> a > 1 + In [3]: a > 1 + Out[3]: array([False, True, False]) *pandas 1.0.0* @@ -640,9 +652,10 @@ scalar values in the result are instances of the extension dtype's scalar type. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.resample("2D").agg(lambda x: 'a').A.dtype + In [1]> df.resample("2D").agg(lambda x: 'a').A.dtype + Out[1]: CategoricalDtype(categories=['a', 'b'], ordered=False) *pandas 1.0.0* @@ -657,9 +670,10 @@ depending on how the results are cast back to the original dtype. *pandas 0.25.x* -.. code-block:: python +.. code-block:: ipython - >>> df.resample("2D").agg(lambda x: 'c') + In [1] df.resample("2D").agg(lambda x: 'c') + Out[1]: A 0 NaN @@ -871,10 +885,10 @@ matplotlib directly rather than :meth:`~DataFrame.plot`. To use pandas formatters with a matplotlib plot, specify -.. code-block:: python +.. code-block:: ipython - >>> import pandas as pd - >>> pd.options.plotting.matplotlib.register_converters = True + In [1]: import pandas as pd + In [2]: pd.options.plotting.matplotlib.register_converters = True Note that plots created by :meth:`DataFrame.plot` and :meth:`Series.plot` *do* register the converters automatically. The only behavior change is when plotting a date-like object via ``matplotlib.pyplot.plot`` diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index bfe30d52e2aff..34e28eab6d4bf 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -52,20 +52,23 @@ DataFrame / Series combination) would ignore the indices, only match the inputs by shape, and use the index/columns of the first DataFrame for the result: -.. code-block:: python +.. code-block:: ipython - >>> df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) - ... 
df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) - >>> df1 + In [1]: df1 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[0, 1]) + In [2]: df2 = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=[1, 2]) + In [3]: df1 + Out[3]: a b 0 1 3 1 2 4 - >>> df2 + In [4]: df2 + Out[4]: a b 1 1 3 2 2 4 - >>> np.add(df1, df2) + In [5]: np.add(df1, df2) + Out[5]: a b 0 2 6 1 4 8 @@ -73,9 +76,10 @@ the result: This contrasts with how other pandas operations work, which first align the inputs: -.. code-block:: python +.. code-block:: ipython - >>> df1 + df2 + In [6]: df1 + df2 + Out[6]: a b 0 NaN NaN 1 3.0 7.0 @@ -94,9 +98,10 @@ objects (eg ``np.add(s1, s2)``) already aligns and continues to do so. To avoid the warning and keep the current behaviour of ignoring the indices, convert one of the arguments to a NumPy array: -.. code-block:: python +.. code-block:: ipython - >>> np.add(df1, np.asarray(df2)) + In [7]: np.add(df1, np.asarray(df2)) + Out[7]: a b 0 2 6 1 4 8 @@ -104,10 +109,11 @@ convert one of the arguments to a NumPy array: To obtain the future behaviour and silence the warning, you can align manually before passing the arguments to the ufunc: -.. code-block:: python +.. code-block:: ipython - >>> df1, df2 = df1.align(df2) - >>> np.add(df1, df2) + In [8]: df1, df2 = df1.align(df2) + In [9]: np.add(df1, df2) + Out[9]: a b 0 NaN NaN 1 3.0 7.0 diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index d0af23b48b1f7..d3ceb2b919b5d 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -1,7 +1,7 @@ .. _whatsnew_125: -What's new in 1.2.5 (May ??, 2021) ----------------------------------- +What's new in 1.2.5 (June 22, 2021) +----------------------------------- These are the changes in pandas 1.2.5. See :ref:`release` for a full changelog including other versions of pandas. @@ -14,32 +14,12 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ -- Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Fixed regression in :func:`concat` between two :class:`DataFrame` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) - Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) -- Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) -- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) -- Regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) - -.. --------------------------------------------------------------------------- - - -.. _whatsnew_125.bug_fixes: - -Bug fixes -~~~~~~~~~ - -- -- - -.. --------------------------------------------------------------------------- - -.. 
_whatsnew_125.other: - -Other -~~~~~ - -- -- +- Fixed regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) +- Fixed regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) +- Fixed regression in :func:`ExcelFile` when a corrupt file is opened but not closed (:issue:`41778`) +- Fixed regression in :meth:`DataFrame.astype` with ``dtype=str`` failing to convert ``NaN`` in categorical columns (:issue:`41797`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index dd95f9088e3da..ed66861efad93 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -1,7 +1,7 @@ .. _whatsnew_130: -What's new in 1.3.0 (??) ------------------------- +What's new in 1.3.0 (July 2, 2021) +---------------------------------- These are the changes in pandas 1.3.0. See :ref:`release` for a full changelog including other versions of pandas. @@ -124,7 +124,7 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404 - The methods :meth:`.Styler.highlight_null`, :meth:`.Styler.highlight_min`, and :meth:`.Styler.highlight_max` now allow custom CSS highlighting instead of the default background coloring (:issue:`40242`) - :meth:`.Styler.apply` now accepts functions that return an ``ndarray`` when ``axis=None``, making it now consistent with the ``axis=0`` and ``axis=1`` behavior (:issue:`39359`) - When incorrectly formatted CSS is given via :meth:`.Styler.apply` or :meth:`.Styler.applymap`, an error is now raised upon rendering (:issue:`39660`) - - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTex escaping (:issue:`40388`, :issue:`41619`) + - :meth:`.Styler.format` now accepts the keyword argument ``escape`` for optional HTML and LaTeX escaping (:issue:`40388`, :issue:`41619`) - :meth:`.Styler.background_gradient` has gained the argument ``gmap`` to supply a specific gradient map for shading (:issue:`22727`) - :meth:`.Styler.clear` now clears :attr:`Styler.hidden_index` and :attr:`Styler.hidden_columns` as well (:issue:`40484`) - Added the method :meth:`.Styler.highlight_between` (:issue:`39821`) @@ -136,8 +136,9 @@ which has been revised and improved (:issue:`39720`, :issue:`39317`, :issue:`404 - Many features of the :class:`.Styler` class are now either partially or fully usable on a DataFrame with a non-unique indexes or columns (:issue:`41143`) - One has greater control of the display through separate sparsification of the index or columns using the :ref:`new styler options `, which are also usable via :func:`option_context` (:issue:`41142`) - Added the option ``styler.render.max_elements`` to avoid browser overload when styling large DataFrames (:issue:`40712`) - - Added the method :meth:`.Styler.to_latex` (:issue:`21673`) + - Added the method :meth:`.Styler.to_latex` (:issue:`21673`, :issue:`42320`), which also allows some limited CSS conversion (:issue:`40731`) - Added the method :meth:`.Styler.to_html` (:issue:`13379`) + - Added the method :meth:`.Styler.set_sticky` to make index and column headers permanently visible in scrolling HTML frames (:issue:`29072`) .. 
_whatsnew_130.enhancements.dataframe_honors_copy_with_dict: @@ -246,11 +247,12 @@ Other enhancements - Improved error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) - Improved consistency of error messages when passing an invalid ``win_type`` argument in :ref:`Window methods ` (:issue:`15969`) - :func:`read_sql_query` now accepts a ``dtype`` argument to cast the columnar data from the SQL database based on user input (:issue:`10285`) +- :func:`read_csv` now raising ``ParserWarning`` if length of header or given names does not match length of data when ``usecols`` is not specified (:issue:`21768`) - Improved integer type mapping from pandas to SQLAlchemy when using :meth:`DataFrame.to_sql` (:issue:`35076`) - :func:`to_numeric` now supports downcasting of nullable ``ExtensionDtype`` objects (:issue:`33013`) - Added support for dict-like names in :class:`MultiIndex.set_names` and :class:`MultiIndex.rename` (:issue:`20421`) - :func:`read_excel` can now auto-detect .xlsb files and older .xls files (:issue:`35416`, :issue:`41225`) -- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behaviour of append mode when writing to existing sheets (:issue:`40230`) +- :class:`ExcelWriter` now accepts an ``if_sheet_exists`` parameter to control the behavior of append mode when writing to existing sheets (:issue:`40230`) - :meth:`.Rolling.sum`, :meth:`.Expanding.sum`, :meth:`.Rolling.mean`, :meth:`.Expanding.mean`, :meth:`.ExponentialMovingWindow.mean`, :meth:`.Rolling.median`, :meth:`.Expanding.median`, :meth:`.Rolling.max`, :meth:`.Expanding.max`, :meth:`.Rolling.min`, and :meth:`.Expanding.min` now support `Numba `_ execution with the ``engine`` keyword (:issue:`38895`, :issue:`41267`) - :meth:`DataFrame.apply` can now accept NumPy unary operators as strings, e.g. ``df.apply("sqrt")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) - :meth:`DataFrame.apply` can now accept non-callable DataFrame properties as strings, e.g. 
``df.apply("size")``, which was already the case for :meth:`Series.apply` (:issue:`39116`) @@ -267,12 +269,16 @@ Other enhancements - :meth:`read_csv` and :meth:`read_json` expose the argument ``encoding_errors`` to control how encoding errors are handled (:issue:`39450`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` use Kleene logic with nullable data types (:issue:`37506`) - :meth:`.GroupBy.any` and :meth:`.GroupBy.all` return a ``BooleanDtype`` for columns with nullable data types (:issue:`33449`) +- :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising with ``object`` data containing ``pd.NA`` even when ``skipna=True`` (:issue:`37501`) - :meth:`.GroupBy.rank` now supports object-dtype data (:issue:`38278`) - Constructing a :class:`DataFrame` or :class:`Series` with the ``data`` argument being a Python iterable that is *not* a NumPy ``ndarray`` consisting of NumPy scalars will now result in a dtype with a precision the maximum of the NumPy scalars; this was already the case when ``data`` is a NumPy ``ndarray`` (:issue:`40908`) - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) - Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) +- :meth:`Series.between` can now accept ``left`` or ``right`` as arguments to ``inclusive`` to include only the left or right boundary (:issue:`40245`) +- :meth:`DataFrame.explode` now supports exploding multiple columns. Its ``column`` argument now also accepts a list of str or tuples for exploding on multiple columns at the same time (:issue:`39240`) +- :meth:`DataFrame.sample` now accepts the ``ignore_index`` argument to reset the index after sampling, similar to :meth:`DataFrame.drop_duplicates` and :meth:`DataFrame.sort_values` (:issue:`38581`) .. --------------------------------------------------------------------------- @@ -301,7 +307,7 @@ As an example of this, given: original = pd.Series(cat) unique = original.unique() -*pandas < 1.3.0*: +*Previous behavior*: .. code-block:: ipython @@ -311,7 +317,7 @@ As an example of this, given: In [2]: original.dtype == unique.dtype False -*pandas >= 1.3.0* +*New behavior*: .. ipython:: python @@ -333,7 +339,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first` df2 combined = df1.combine_first(df2) -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -344,7 +350,7 @@ Preserve dtypes in :meth:`DataFrame.combine_first` C float64 dtype: object -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -367,7 +373,7 @@ values as measured by ``np.allclose``. Now no such casting occurs. df = pd.DataFrame({'key': [1, 1], 'a': [True, False], 'b': [True, True]}) df -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -377,7 +383,7 @@ values as measured by ``np.allclose``. Now no such casting occurs. key 1 True 2 -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -395,7 +401,7 @@ Now, these methods will always return a float dtype. (:issue:`41137`) df = pd.DataFrame({'a': [True], 'b': [1], 'c': [1.0]}) -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -404,7 +410,7 @@ Now, these methods will always return a float dtype. 
(:issue:`41137`) a b c 0 True 1 1.0 -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -428,7 +434,7 @@ insert the values into the existing data rather than create an entirely new arra In both the new and old behavior, the data in ``values`` is overwritten, but in the old behavior the dtype of ``df["A"]`` changed to ``int64``. -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -443,7 +449,7 @@ the old behavior the dtype of ``df["A"]`` changed to ``int64``. In pandas 1.3.0, ``df`` continues to share data with ``values`` -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -470,7 +476,7 @@ never casting to the dtypes of the existing arrays. In the old behavior, ``5`` was cast to ``float64`` and inserted into the existing array backing ``df``: -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -480,7 +486,7 @@ array backing ``df``: In the new behavior, we get a new array, and retain an integer-dtyped ``5``: -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -503,7 +509,7 @@ casts to ``dtype=object`` (:issue:`38709`) ser2 = orig.copy() ser2.iloc[1] = 2.0 -*pandas 1.2.x* +*Previous behavior*: .. code-block:: ipython @@ -519,7 +525,7 @@ casts to ``dtype=object`` (:issue:`38709`) 1 2.0 dtype: object -*pandas 1.3.0* +*New behavior*: .. ipython:: python @@ -637,7 +643,7 @@ If installed, we now require: +-----------------+-----------------+----------+---------+ | pytest (dev) | 6.0 | | X | +-----------------+-----------------+----------+---------+ -| mypy (dev) | 0.800 | | X | +| mypy (dev) | 0.812 | | X | +-----------------+-----------------+----------+---------+ | setuptools | 38.6.0 | | X | +-----------------+-----------------+----------+---------+ @@ -701,6 +707,8 @@ Other API changes - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as `turbodbc `_ (:issue:`36893`) - Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) - :meth:`ExtensionDtype.construct_array_type` is now a required method instead of an optional one for :class:`ExtensionDtype` subclasses (:issue:`24860`) +- Calling ``hash`` on non-hashable pandas objects will now raise ``TypeError`` with the built-in error message (e.g. ``unhashable type: 'Series'``). Previously it would raise a custom message such as ``'Series' objects are mutable, thus they cannot be hashed``. Furthermore, ``isinstance(<Series>, abc.collections.Hashable)`` will now return ``False`` (:issue:`40013`) +- :meth:`.Styler.from_custom_template` now has two new arguments for template names, and removed the old ``name``, due to template inheritance having been introduced for better parsing (:issue:`42053`). Subclassing modifications to Styler attributes are also needed. .. 
_whatsnew_130.api_breaking.build: @@ -714,64 +722,6 @@ Build Deprecations ~~~~~~~~~~~~ -- Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`) -- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`) -- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`) -- Deprecated the :meth:`astype` method of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to convert to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) -- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`) -- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`) -- Deprecated comparison of :class:`Timestamp` objects with ``datetime.date`` objects. Instead of e.g. ``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`) -- Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`) -- Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`) -- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`) -- Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`) -- Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`) -- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) -- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) -- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) -- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) -- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) -- Deprecated using :func:`merge`, :meth:`DataFrame.merge`, and :meth:`DataFrame.join` on a different number of levels (:issue:`34862`) -- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) -- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) -- Deprecated the ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` and will be removed in a 
future version (:issue:`37643`) -- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) -- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`) -- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) -- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) -- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`) -- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) -- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`) -- Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. ``freq.is_month_start(ts)`` (:issue:`15146`) -- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`) -- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) -- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) -- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) -- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). 
To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) -- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`) -- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`): - - :func:`concat` (other than ``objs``) - - :func:`read_csv` (other than ``filepath_or_buffer``) - - :func:`read_table` (other than ``filepath_or_buffer``) - - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``) - - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` - - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop` - - :meth:`DataFrame.dropna` and :meth:`Series.dropna` - - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` - - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``) - - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``) - - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``) - - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index` - - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``) - - :meth:`DataFrame.set_index` (other than ``keys``) - - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` - - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values` - - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``) - - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) - - :meth:`MultiIndex.codes` (except for ``codes``) - - :meth:`MultiIndex.set_levels` (except for ``levels``) - - :meth:`Resampler.interpolate` (other than ``method``) - .. _whatsnew_130.deprecations.nuisance_columns: @@ -840,6 +790,8 @@ For example: 1 2 2 12 +*Future behavior*: + .. code-block:: ipython In [5]: gb.prod(numeric_only=False) @@ -852,6 +804,72 @@ For example: 1 2 2 12 +.. _whatsnew_130.deprecations.other: + +Other Deprecations +^^^^^^^^^^^^^^^^^^ +- Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`) +- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`) +- Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`) +- Deprecated the :meth:`astype` method of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to convert to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) +- Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`) +- Deprecated keyword ``try_cast`` in :meth:`Series.where`, :meth:`Series.mask`, :meth:`DataFrame.where`, :meth:`DataFrame.mask`; cast results manually if desired (:issue:`38836`) +- Deprecated comparison of :class:`Timestamp` objects with ``datetime.date`` objects. Instead of e.g. 
``ts <= mydate`` use ``ts <= pd.Timestamp(mydate)`` or ``ts.date() <= mydate`` (:issue:`36131`) +- Deprecated :attr:`Rolling.win_type` returning ``"freq"`` (:issue:`38963`) +- Deprecated :attr:`Rolling.is_datetimelike` (:issue:`38963`) +- Deprecated :class:`DataFrame` indexer for :meth:`Series.__setitem__` and :meth:`DataFrame.__setitem__` (:issue:`39004`) +- Deprecated :meth:`ExponentialMovingWindow.vol` (:issue:`39220`) +- Using ``.astype`` to convert between ``datetime64[ns]`` dtype and :class:`DatetimeTZDtype` is deprecated and will raise in a future version, use ``obj.tz_localize`` or ``obj.dt.tz_localize`` instead (:issue:`38622`) +- Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) +- Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favor of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) +- Deprecated :meth:`.Styler.where` in favor of using an alternative formulation with :meth:`Styler.applymap` (:issue:`40821`) +- Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :meth:`read_csv` and :meth:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) +- Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) +- Deprecated using :func:`merge`, :meth:`DataFrame.merge`, and :meth:`DataFrame.join` on a different number of levels (:issue:`34862`) +- Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) +- Deprecated the ``level`` keyword for :class:`DataFrame` and :class:`Series` aggregations; use groupby instead (:issue:`39983`) +- Deprecated the ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` and will be removed in a future version (:issue:`37643`) +- Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) +- Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`) +- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) +- Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) +- Deprecated using ``usecols`` with out of bounds indices for :func:`read_csv` with ``engine="c"`` (:issue:`25623`) +- Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) +- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. 
In a future version, this will raise instead of being silently ignored (:issue:`24435`) +- Deprecated the :attr:`Timestamp.freq` attribute. For the properties that use it (``is_month_start``, ``is_month_end``, ``is_quarter_start``, ``is_quarter_end``, ``is_year_start``, ``is_year_end``), when you have a ``freq``, use e.g. ``freq.is_month_start(ts)`` (:issue:`15146`) +- Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`, :issue:`33401`) +- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated behavior of :class:`DataFrame` construction with floating data and integer dtype casting even when lossy; in a future version this will remain floating, matching :class:`Series` behavior (:issue:`41770`) +- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) +- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) +- Deprecated passing lists as ``key`` to :meth:`DataFrame.xs` and :meth:`Series.xs` (:issue:`41760`) +- Deprecated boolean arguments of ``inclusive`` in :meth:`Series.between` to have ``{"left", "right", "neither", "both"}`` as standard argument values (:issue:`40628`) +- Deprecated passing arguments as positional for all of the following, with exceptions noted (:issue:`41485`): + + - :func:`concat` (other than ``objs``) + - :func:`read_csv` (other than ``filepath_or_buffer``) + - :func:`read_table` (other than ``filepath_or_buffer``) + - :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``upper`` and ``lower``) + - :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` + - :meth:`DataFrame.drop` (other than ``labels``) and :meth:`Series.drop` + - :meth:`DataFrame.dropna` and :meth:`Series.dropna` + - :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` + - :meth:`DataFrame.fillna` and :meth:`Series.fillna` (apart from ``value``) + - :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (other than ``method``) + - :meth:`DataFrame.mask` and :meth:`Series.mask` (other than ``cond`` and ``other``) + - :meth:`DataFrame.reset_index` (other than ``level``) and :meth:`Series.reset_index` + - :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``labels``) + - :meth:`DataFrame.set_index` (other than ``keys``) + - :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` + - :meth:`DataFrame.sort_values` (other than ``by``) and :meth:`Series.sort_values` + - :meth:`DataFrame.where` and :meth:`Series.where` (other than ``cond`` and ``other``) + - :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) + - :meth:`MultiIndex.codes` (except for ``codes``) + - :meth:`MultiIndex.set_levels` (except for ``levels``) + - 
:meth:`Resampler.interpolate` (other than ``method``) + + .. --------------------------------------------------------------------------- @@ -873,7 +891,7 @@ Performance improvements - Performance improvement in :class:`.Styler` where render times are more than 50% reduced and now matches :meth:`DataFrame.to_html` (:issue:`39972` :issue:`39952`, :issue:`40425`) - The method :meth:`.Styler.set_td_classes` is now as performant as :meth:`.Styler.apply` and :meth:`.Styler.applymap`, and even more so in some cases (:issue:`40453`) - Performance improvement in :meth:`.ExponentialMovingWindow.mean` with ``times`` (:issue:`39784`) -- Performance improvement in :meth:`.GroupBy.apply` when requiring the python fallback implementation (:issue:`40176`) +- Performance improvement in :meth:`.GroupBy.apply` when requiring the Python fallback implementation (:issue:`40176`) - Performance improvement in the conversion of a PyArrow Boolean array to a pandas nullable Boolean array (:issue:`41051`) - Performance improvement for concatenation of data with type :class:`CategoricalDtype` (:issue:`40193`) - Performance improvement in :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` with nullable data types (:issue:`37493`) @@ -905,6 +923,7 @@ Datetimelike - Bug in constructing a :class:`DataFrame` or :class:`Series` with mismatched ``datetime64`` data and ``timedelta64`` dtype, or vice-versa, failing to raise a ``TypeError`` (:issue:`38575`, :issue:`38764`, :issue:`38792`) - Bug in constructing a :class:`Series` or :class:`DataFrame` with a ``datetime`` object out of bounds for ``datetime64[ns]`` dtype or a ``timedelta`` object out of bounds for ``timedelta64[ns]`` dtype (:issue:`38792`, :issue:`38965`) - Bug in :meth:`DatetimeIndex.intersection`, :meth:`DatetimeIndex.symmetric_difference`, :meth:`PeriodIndex.intersection`, :meth:`PeriodIndex.symmetric_difference` always returning object-dtype when operating with :class:`CategoricalIndex` (:issue:`38741`) +- Bug in :meth:`DatetimeIndex.intersection` giving incorrect results with non-Tick frequencies with ``n != 1`` (:issue:`42104`) - Bug in :meth:`Series.where` incorrectly casting ``datetime64`` values to ``int64`` (:issue:`37682`) - Bug in :class:`Categorical` incorrectly typecasting ``datetime`` object to ``Timestamp`` (:issue:`38878`) - Bug in comparisons between :class:`Timestamp` object and ``datetime64`` objects just outside the implementation bounds for nanosecond ``datetime64`` (:issue:`39221`) @@ -912,6 +931,7 @@ Datetimelike - Bug in :meth:`Timedelta.round`, :meth:`Timedelta.floor`, :meth:`Timedelta.ceil` for values near the implementation bounds of :class:`Timedelta` (:issue:`38964`) - Bug in :func:`date_range` incorrectly creating :class:`DatetimeIndex` containing ``NaT`` instead of raising ``OutOfBoundsDatetime`` in corner cases (:issue:`24124`) - Bug in :func:`infer_freq` incorrectly fails to infer 'H' frequency of :class:`DatetimeIndex` if the latter has a timezone and crosses DST boundaries (:issue:`39556`) +- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) Timedelta ^^^^^^^^^ @@ -941,6 +961,9 @@ Numeric - Bug in :meth:`Series.count` would result in an ``int32`` result on 32-bit platforms when argument ``level=None`` (:issue:`40908`) - Bug in :class:`Series` and :class:`DataFrame` reductions with methods ``any`` and ``all`` not returning Boolean results for object data (:issue:`12863`, :issue:`35450`, :issue:`27709`) - Bug in 
:meth:`Series.clip` would fail if the Series contains NA values and has nullable int or float as a data type (:issue:`40851`) +- Bug in :meth:`UInt64Index.where` and :meth:`UInt64Index.putmask` with an ``np.int64`` dtype ``other`` incorrectly raising ``TypeError`` (:issue:`41974`) +- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggregation functions when one or more aggregation function fails to produce results (:issue:`33634`) +- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) Conversion ^^^^^^^^^^ @@ -956,6 +979,12 @@ Conversion - Bug in :class:`DataFrame` and :class:`Series` construction with ``datetime64[ns]`` data and ``dtype=object`` resulting in ``datetime`` objects instead of :class:`Timestamp` objects (:issue:`41599`) - Bug in :class:`DataFrame` and :class:`Series` construction with ``timedelta64[ns]`` data and ``dtype=object`` resulting in ``np.timedelta64`` objects instead of :class:`Timedelta` objects (:issue:`41599`) - Bug in :class:`DataFrame` construction when given a two-dimensional object-dtype ``np.ndarray`` of :class:`Period` or :class:`Interval` objects failing to cast to :class:`PeriodDtype` or :class:`IntervalDtype`, respectively (:issue:`41812`) +- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) +- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) +- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) +- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`) +- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. 
It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) +- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`) Strings ^^^^^^^ @@ -976,6 +1005,7 @@ Indexing ^^^^^^^^ - Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) +- Bug in :meth:`IntervalIndex.get_indexer` when ``target`` has ``CategoricalDtype`` and both the index and the target contain NA values (:issue:`41934`) - Bug in :meth:`Series.loc` raising a ``ValueError`` when input was filtered with a Boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising a ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) @@ -1007,12 +1037,17 @@ Indexing - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contained duplicates (:issue:`40096`) - Bug in :meth:`DataFrame.loc.__getitem__` with :class:`MultiIndex` casting to float when at least one index column has float dtype and we retrieve a scalar (:issue:`41369`) - Bug in :meth:`DataFrame.loc` incorrectly matching non-Boolean index elements (:issue:`20432`) +- Bug in indexing with ``np.nan`` on a :class:`Series` or :class:`DataFrame` with a :class:`CategoricalIndex` incorrectly raising ``KeyError`` when ``np.nan`` keys are present (:issue:`41933`) - Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) +- Bug in :meth:`DataFrame.at` with a :class:`CategoricalIndex` returning incorrect results when passed integer keys (:issue:`41846`) - Bug in :meth:`DataFrame.loc` returning a :class:`MultiIndex` in the wrong order if an indexer has duplicates (:issue:`40978`) - Bug in :meth:`DataFrame.__setitem__` raising a ``TypeError`` when using a ``str`` subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) - Bug in :meth:`PeriodIndex.get_loc` failing to raise a ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) - Bug ``.loc.__getitem__`` with a :class:`UInt64Index` and negative-integer keys raising ``OverflowError`` instead of ``KeyError`` in some cases, wrapping around to positive integers in others (:issue:`41777`) - Bug in :meth:`Index.get_indexer` failing to raise ``ValueError`` in some cases with invalid ``method``, ``limit``, or ``tolerance`` arguments (:issue:`41918`) +- Bug when slicing a :class:`Series` or :class:`DataFrame` with a :class:`TimedeltaIndex` when passing an invalid string raising ``ValueError`` instead of a ``TypeError`` (:issue:`41821`) +- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) +- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. 
``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) Missing ^^^^^^^ @@ -1021,6 +1056,7 @@ Missing - Bug in :meth:`DataFrame.fillna` not accepting a dictionary for the ``downcast`` keyword (:issue:`40809`) - Bug in :func:`isna` not returning a copy of the mask for nullable types, causing any subsequent mask modification to change the original array (:issue:`40935`) - Bug in :class:`DataFrame` construction with float data containing ``NaN`` and an integer ``dtype`` casting instead of retaining the ``NaN`` (:issue:`26919`) +- Bug in :meth:`Series.isin` and :meth:`MultiIndex.isin` didn't treat all nans as equivalent if they were in tuples (:issue:`41836`) MultiIndex ^^^^^^^^^^ @@ -1028,6 +1064,7 @@ MultiIndex - Bug in :meth:`MultiIndex.intersection` duplicating ``NaN`` in the result (:issue:`38623`) - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when the :class:`MultiIndex` contained ``NaN`` even when they are differently ordered (:issue:`38439`) - Bug in :meth:`MultiIndex.intersection` always returning an empty result when intersecting with :class:`CategoricalIndex` (:issue:`38653`) +- Bug in :meth:`MultiIndex.difference` incorrectly raising ``TypeError`` when indexes contain non-sortable entries (:issue:`41915`) - Bug in :meth:`MultiIndex.reindex` raising a ``ValueError`` when used on an empty :class:`MultiIndex` and indexing only a specific level (:issue:`41170`) - Bug in :meth:`MultiIndex.reindex` raising ``TypeError`` when reindexing against a flat :class:`Index` (:issue:`41707`) @@ -1067,6 +1104,7 @@ I/O - Bug in the conversion from PyArrow to pandas (e.g. for reading Parquet) with nullable dtypes and a PyArrow array whose data buffer size is not a multiple of the dtype size (:issue:`40896`) - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type even though the user specified the ``engine`` argument (:issue:`41225`) - Bug in :func:`read_clipboard` copying from an excel file shifts values into the wrong column if there are null values in first column (:issue:`41108`) +- Bug in :meth:`DataFrame.to_hdf` and :meth:`Series.to_hdf` raising a ``TypeError`` when trying to append a string column to an incompatible column (:issue:`41897`) Period ^^^^^^ @@ -1126,6 +1164,8 @@ Groupby/resample/rolling - Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`) - Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) - Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) +- Bug in :class:`DataFrameGroupBy` methods ``agg``, ``transform``, ``sum``, ``bfill``, ``ffill``, ``pad``, ``pct_change``, ``shift``, ``ohlc`` dropping ``.columns.names`` (:issue:`41497`) + Reshaping ^^^^^^^^^ @@ -1148,6 +1188,8 @@ Reshaping - Bug in :func:`to_datetime` raising an error when the input sequence contained unhashable items (:issue:`39756`) - Bug in :meth:`Series.explode` preserving the index when ``ignore_index`` was ``True`` and values were scalars (:issue:`40487`) - Bug in :func:`to_datetime` raising a ``ValueError`` when :class:`Series` contains ``None`` and ``NaT`` and has more than 50 elements (:issue:`39882`) +- Bug in :meth:`Series.unstack` and :meth:`DataFrame.unstack` with object-dtype values containing 
timezone-aware datetime objects incorrectly raising ``TypeError`` (:issue:`41875`) +- Bug in :meth:`DataFrame.melt` raising ``InvalidIndexError`` when :class:`DataFrame` has duplicate columns used as ``value_vars`` (:issue:`41951`) Sparse ^^^^^^ @@ -1175,24 +1217,14 @@ Styler Other ^^^^^ -- Bug in :class:`Index` constructor sometimes silently ignoring a specified ``dtype`` (:issue:`38879`) -- Bug in :func:`.infer_dtype` not recognizing Series, Index, or array with a Period dtype (:issue:`23553`) -- Bug in :func:`.infer_dtype` raising an error for general :class:`.ExtensionArray` objects. It will now return ``"unknown-array"`` instead of raising (:issue:`37367`) -- Bug in constructing a :class:`Series` from a list and a :class:`PandasDtype` (:issue:`39357`) - ``inspect.getmembers(Series)`` no longer raises an ``AbstractMethodError`` (:issue:`38782`) - Bug in :meth:`Series.where` with numeric dtype and ``other=None`` not casting to ``nan`` (:issue:`39761`) -- :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. ``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) - Bug in :func:`.assert_series_equal`, :func:`.assert_frame_equal`, :func:`.assert_index_equal` and :func:`.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) - Bug in :func:`.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, and :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`show_versions` where console JSON output was not proper JSON (:issue:`39701`) - pandas can now compile on z/OS when using `xlc `_ (:issue:`35826`) -- Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised a ``ValueError`` when called on an empty DataFrame (:issue:`40393`) -- Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggragation functions when one or more aggregation function fails to produce results (:issue:`33634`) -- Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) -- Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) -- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) -- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) +- Bug in :func:`pandas.util.hash_pandas_object` not recognizing ``hash_key``, ``encoding`` and ``categorize`` when the input object type is a :class:`DataFrame` (:issue:`41404`) .. --------------------------------------------------------------------------- @@ -1201,4 +1233,4 @@ Other Contributors ~~~~~~~~~~~~ -.. contributors:: v1.2.4..v1.3.0|HEAD +.. contributors:: v1.2.5..v1.3.0 diff --git a/doc/source/whatsnew/v1.3.1.rst b/doc/source/whatsnew/v1.3.1.rst new file mode 100644 index 0000000000000..a57995eb0db9a --- /dev/null +++ b/doc/source/whatsnew/v1.3.1.rst @@ -0,0 +1,51 @@ +.. _whatsnew_131: + +What's new in 1.3.1 (July 25, 2021) +----------------------------------- + +These are the changes in pandas 1.3.1. See :ref:`release` for a full changelog +including other versions of pandas. 
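A minimal sketch of the ``pandas.util.hash_pandas_object`` keywords covered by the 1.3.0 "Other" fix above (:issue:`41404`); the frame, key, and settings below are illustrative only and are not taken from the patch:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

   # Row-wise uint64 hashes. Before the fix for GH 41404, these keyword
   # arguments were silently ignored when the input was a DataFrame.
   hashed = pd.util.hash_pandas_object(
       df,
       index=True,                   # include the index in each row's hash
       encoding="utf8",              # encoding used for object/string data
       hash_key="0123456789123456",  # 16-character hash key (the default)
       categorize=False,             # skip the categorical fast path
   )
   print(hashed)

The call returns a ``Series`` of ``uint64`` values, one per row of the input.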
+ +{{ header }} + +.. --------------------------------------------------------------------------- + +.. _whatsnew_131.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Pandas could not be built on PyPy (:issue:`42355`) +- :class:`DataFrame` constructed with an older version of pandas could not be unpickled (:issue:`42345`) +- Performance regression in constructing a :class:`DataFrame` from a dictionary of dictionaries (:issue:`42248`) +- Fixed regression in :meth:`DataFrame.agg` dropping values when the DataFrame had an Extension Array dtype, a duplicate index, and ``axis=1`` (:issue:`42380`) +- Fixed regression in :meth:`DataFrame.astype` changing the order of noncontiguous data (:issue:`42396`) +- Performance regression in :class:`DataFrame` in reduction operations requiring casting such as :meth:`DataFrame.mean` on integer data (:issue:`38592`) +- Performance regression in :meth:`DataFrame.to_dict` and :meth:`Series.to_dict` when ``orient`` argument one of "records", "dict", or "split" (:issue:`42352`) +- Fixed regression in indexing with a ``list`` subclass incorrectly raising ``TypeError`` (:issue:`42433`, :issue:`42461`) +- Fixed regression in :meth:`DataFrame.isin` and :meth:`Series.isin` raising ``TypeError`` with nullable data containing at least one missing value (:issue:`42405`) +- Regression in :func:`concat` between objects with bool dtype and integer dtype casting to object instead of to integer (:issue:`42092`) +- Bug in :class:`Series` constructor not accepting a ``dask.Array`` (:issue:`38645`) +- Fixed regression for ``SettingWithCopyWarning`` displaying incorrect stacklevel (:issue:`42570`) +- Fixed regression for :func:`merge_asof` raising ``KeyError`` when one of the ``by`` columns is in the index (:issue:`34488`) +- Fixed regression in :func:`to_datetime` returning pd.NaT for inputs that produce duplicated values, when ``cache=True`` (:issue:`42259`) +- Fixed regression in :meth:`SeriesGroupBy.value_counts` that resulted in an ``IndexError`` when called on a Series with one row (:issue:`42618`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_131.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Fixed bug in :meth:`DataFrame.transpose` dropping values when the DataFrame had an Extension Array dtype and a duplicate index (:issue:`42380`) +- Fixed bug in :meth:`DataFrame.to_xml` raising ``KeyError`` when called with ``index=False`` and an offset index (:issue:`42458`) +- Fixed bug in :meth:`.Styler.set_sticky` not handling index names correctly for single index columns case (:issue:`42537`) +- Fixed bug in :meth:`DataFrame.copy` failing to consolidate blocks in the result (:issue:`42579`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_131.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.3.0..v1.3.1 diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst new file mode 100644 index 0000000000000..e3c6268547dd2 --- /dev/null +++ b/doc/source/whatsnew/v1.3.2.rst @@ -0,0 +1,51 @@ +.. _whatsnew_132: + +What's new in 1.3.2 (August 15, 2021) +------------------------------------- + +These are the changes in pandas 1.3.2. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_132.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Performance regression in :meth:`DataFrame.isin` and :meth:`Series.isin` for nullable data types (:issue:`42714`) +- Regression in updating values of :class:`Series` using boolean index, created by using :meth:`DataFrame.pop` (:issue:`42530`) +- Regression in :meth:`DataFrame.from_records` with empty records (:issue:`42456`) +- Fixed regression in :meth:`DataFrame.shift` where ``TypeError`` occurred when shifting DataFrame created by concatenation of slices and fills with values (:issue:`42719`) +- Regression in :meth:`DataFrame.agg` when the ``func`` argument returned lists and ``axis=1`` (:issue:`42727`) +- Regression in :meth:`DataFrame.drop` does nothing if :class:`MultiIndex` has duplicates and indexer is a tuple or list of tuples (:issue:`42771`) +- Fixed regression where :func:`read_csv` raised a ``ValueError`` when parameters ``names`` and ``prefix`` were both set to ``None`` (:issue:`42387`) +- Fixed regression in comparisons between :class:`Timestamp` object and ``datetime64`` objects outside the implementation bounds for nanosecond ``datetime64`` (:issue:`42794`) +- Fixed regression in :meth:`.Styler.highlight_min` and :meth:`.Styler.highlight_max` where ``pandas.NA`` was not successfully ignored (:issue:`42650`) +- Fixed regression in :func:`concat` where ``copy=False`` was not honored in ``axis=1`` Series concatenation (:issue:`42501`) +- Regression in :meth:`Series.nlargest` and :meth:`Series.nsmallest` with nullable integer or float dtype (:issue:`42816`) +- Fixed regression in :meth:`Series.quantile` with :class:`Int64Dtype` (:issue:`42626`) +- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` where supplying the ``by`` argument with a Series named with a tuple would incorrectly raise (:issue:`42731`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_132.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :func:`read_excel` modifies the dtypes dictionary when reading a file with duplicate columns (:issue:`42462`) +- 1D slices over extension types turn into N-dimensional slices over ExtensionArrays (:issue:`42430`) +- Fixed bug in :meth:`Series.rolling` and :meth:`DataFrame.rolling` not calculating window bounds correctly for the first row when ``center=True`` and ``window`` is an offset that covers all the rows (:issue:`42753`) +- :meth:`.Styler.hide_columns` now hides the index name header row as well as column headers (:issue:`42101`) +- :meth:`.Styler.set_sticky` has amended CSS to control the column/index names and ensure the correct sticky positions (:issue:`42537`) +- Bug in de-serializing datetime indexes in PYTHONOPTIMIZED mode (:issue:`42866`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_132.contributors: + +Contributors +~~~~~~~~~~~~ + +.. contributors:: v1.3.1..v1.3.2 diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst new file mode 100644 index 0000000000000..9be673f16c6e0 --- /dev/null +++ b/doc/source/whatsnew/v1.3.3.rst @@ -0,0 +1,70 @@ +.. _whatsnew_133: + +What's new in 1.3.3 (September ??, 2021) +---------------------------------------- + +These are the changes in pandas 1.3.3. See :ref:`release` for a full changelog +including other versions of pandas. + +{{ header }} + +.. --------------------------------------------------------------------------- + +.. 
_whatsnew_133.regressions: + +Fixed regressions +~~~~~~~~~~~~~~~~~ +- Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) +- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) +- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) +- Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) +- Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) +- Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) +- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) +- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`) +- Fixed regression in :meth:`DataFrame.loc.__setitem__` raising ``ValueError`` when setting array as cell value (:issue:`43422`) +- Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`) +- Fixed regression in :meth:`DataFrame.__getitem__` raising error for slice of :class:`DatetimeIndex` when index is non monotonic (:issue:`43223`) +- Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`) +- Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`) +- Fixed regression in :meth:`DataFrame.groupby` where aggregation on columns with object types dropped results on those columns (:issue:`42395`, :issue:`43108`) +- Fixed regression in :meth:`Series.fillna` raising ``TypeError`` when filling ``float`` ``Series`` with list-like fill value having a dtype which couldn't cast losslessly (like ``float32`` filled with ``float64``) (:issue:`43424`) +- Fixed regression in :func:`read_csv` throwing an ``AttributeError`` when the file handle is a ``tempfile.SpooledTemporaryFile`` object (:issue:`43439`) +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_133.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement for :meth:`DataFrame.__setitem__` when the key or value is not a :class:`DataFrame`, or key is not list-like (:issue:`43274`) +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_133.bug_fixes: + +Bug fixes +~~~~~~~~~ +- Bug in :meth:`.DataFrameGroupBy.agg` and :meth:`.DataFrameGroupBy.transform` with ``engine="numba"`` where ``index`` data was not being correctly passed into ``func`` (:issue:`43133`) +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_133.other: + +Other +~~~~~ +- +- + +.. --------------------------------------------------------------------------- + +.. _whatsnew_133.contributors: + +Contributors +~~~~~~~~~~~~ + +.. 
contributors:: v1.3.2..v1.3.3|HEAD diff --git a/environment.yml b/environment.yml index 788b88ef16ad6..13d022c3f0e72 100644 --- a/environment.yml +++ b/environment.yml @@ -106,10 +106,11 @@ dependencies: - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.5.1 # pandas.read_hdf, DataFrame.to_hdf - s3fs>=0.4.0 # file IO when using 's3://...' path - - fsspec>=0.7.4 # for generic remote file operations + - aiobotocore + - fsspec>=0.7.4, <2021.6.0 # for generic remote file operations - gcsfs>=0.6.0 # file IO when using 'gcs://...' path - sqlalchemy # pandas.read_sql, DataFrame.to_sql - - xarray # DataFrame.to_xarray + - xarray<0.19 # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test - pyreadstat # pandas.read_spss - tabulate>=0.8.3 # DataFrame.to_markdown diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 03f4ce273de6e..f619f09f85e66 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -481,100 +481,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr return result -# ---------------------------------------------------------------------- -# Kendall correlation -# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient - -@cython.boundscheck(False) -@cython.wraparound(False) -def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: - """ - Perform kendall correlation on a 2d array - - Parameters - ---------- - mat : np.ndarray[float64_t, ndim=2] - Array to compute kendall correlation on - minp : int, default 1 - Minimum number of observations required per pair of columns - to have a valid result. - - Returns - ------- - numpy.ndarray[float64_t, ndim=2] - Correlation matrix - """ - cdef: - Py_ssize_t i, j, k, xi, yi, N, K - ndarray[float64_t, ndim=2] result - ndarray[float64_t, ndim=2] ranked_mat - ndarray[uint8_t, ndim=2] mask - float64_t currj - ndarray[uint8_t, ndim=1] valid - ndarray[int64_t] sorted_idxs - ndarray[float64_t, ndim=1] col - int64_t n_concordant - int64_t total_concordant = 0 - int64_t total_discordant = 0 - float64_t kendall_tau - int64_t n_obs - const intp_t[:] labels_n - - N, K = (mat).shape - - result = np.empty((K, K), dtype=np.float64) - mask = np.isfinite(mat) - - ranked_mat = np.empty((N, K), dtype=np.float64) - # For compatibility when calling rank_1d - labels_n = np.zeros(N, dtype=np.intp) - - for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i], labels_n) - - for xi in range(K): - sorted_idxs = ranked_mat[:, xi].argsort() - ranked_mat = ranked_mat[sorted_idxs] - mask = mask[sorted_idxs] - for yi in range(xi + 1, K): - valid = mask[:, xi] & mask[:, yi] - if valid.sum() < minp: - result[xi, yi] = NaN - result[yi, xi] = NaN - else: - # Get columns and order second column using 1st column ranks - if not valid.all(): - col = ranked_mat[valid.nonzero()][:, yi] - else: - col = ranked_mat[:, yi] - n_obs = col.shape[0] - total_concordant = 0 - total_discordant = 0 - for j in range(n_obs - 1): - currj = col[j] - # Count num concordant and discordant pairs - n_concordant = 0 - for k in range(j, n_obs): - if col[k] > currj: - n_concordant += 1 - total_concordant += n_concordant - total_discordant += (n_obs - 1 - j - n_concordant) - # Note: we do total_concordant+total_discordant here which is - # equivalent to the C(n, 2), the total # of pairs, - # listed on wikipedia - kendall_tau = (total_concordant - total_discordant) / \ - (total_concordant + total_discordant) - result[xi, yi] = kendall_tau - result[yi, xi] = kendall_tau - - 
if mask[:, xi].sum() > minp: - result[xi, xi] = 1 - else: - result[xi, xi] = NaN - - return result - - # ---------------------------------------------------------------------- ctypedef fused algos_t: @@ -1030,9 +936,9 @@ def rank_1d( if rank_t is object: nan_fill_val = Infinity() elif rank_t is int64_t: - nan_fill_val = np.iinfo(np.int64).max + nan_fill_val = util.INT64_MAX elif rank_t is uint64_t: - nan_fill_val = np.iinfo(np.uint64).max + nan_fill_val = util.UINT64_MAX else: nan_fill_val = np.inf order = (masked_vals, mask, labels) @@ -1393,7 +1299,7 @@ def rank_2d( # int64 and datetimelike else: - nan_value = np.iinfo(np.int64).max + nan_value = util.INT64_MAX else: if rank_t is object: diff --git a/pandas/_libs/algos_take_helper.pxi.in b/pandas/_libs/algos_take_helper.pxi.in index 11679fc432edc..96605fd2009fb 100644 --- a/pandas/_libs/algos_take_helper.pxi.in +++ b/pandas/_libs/algos_take_helper.pxi.in @@ -9,31 +9,6 @@ WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in # ---------------------------------------------------------------------- -@cython.wraparound(False) -@cython.boundscheck(False) -def take_1d_intp_intp( - const intp_t[:] values, - const intp_t[:] indexer, - intp_t[::1] out, - intp_t fill_value=-1, -): - cdef: - Py_ssize_t i, n, idx - intp_t fv - - n = indexer.shape[0] - - fv = fill_value - - with nogil: - for i in range(n): - idx = indexer[i] - if idx == -1: - out[i] = fv - else: - out[i] = values[idx] - - {{py: # c_type_in, c_type_out diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 5a1b98b190dbc..951703e04d5a3 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -228,3 +228,5 @@ def ismember( arr: np.ndarray, values: np.ndarray, ) -> np.ndarray: ... # np.ndarray[bool] +def object_hash(obj) -> int: ... +def objects_are_equal(a, b) -> bool: ... diff --git a/pandas/_libs/hashtable.pyx b/pandas/_libs/hashtable.pyx index 7df3f69337643..132435701bddb 100644 --- a/pandas/_libs/hashtable.pyx +++ b/pandas/_libs/hashtable.pyx @@ -34,6 +34,8 @@ from pandas._libs.khash cimport ( are_equivalent_khcomplex64_t, are_equivalent_khcomplex128_t, kh_needed_n_buckets, + kh_python_hash_equal, + kh_python_hash_func, kh_str_t, khcomplex64_t, khcomplex128_t, @@ -46,6 +48,14 @@ def get_hashtable_trace_domain(): return KHASH_TRACE_DOMAIN +def object_hash(obj): + return kh_python_hash_func(obj) + + +def objects_are_equal(a, b): + return kh_python_hash_equal(a, b) + + cdef int64_t NPY_NAT = util.get_nat() SIZE_HINT_LIMIT = (1 << 20) + 7 diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 6c1ca3deba047..cfd4695a5335b 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -569,7 +569,12 @@ cdef class BlockManager: public bint _known_consolidated, _is_consolidated public ndarray _blknos, _blklocs - def __cinit__(self, blocks, axes, verify_integrity=True): + def __cinit__(self, blocks=None, axes=None, verify_integrity=True): + # None as defaults for unpickling GH#42345 + if blocks is None: + # This adds 1-2 microseconds to DataFrame(np.array([])) + return + if isinstance(blocks, list): # Backward compat for e.g. 
pyarrow blocks = tuple(blocks) diff --git a/pandas/_libs/join.pyx b/pandas/_libs/join.pyx index b69b89c0de019..eefa16d23f576 100644 --- a/pandas/_libs/join.pyx +++ b/pandas/_libs/join.pyx @@ -20,27 +20,22 @@ from numpy cimport ( cnp.import_array() -from pandas._libs.algos import ( - groupsort_indexer, - take_1d_int64_int64, - take_1d_intp_intp, -) +from pandas._libs.algos import groupsort_indexer +@cython.wraparound(False) @cython.boundscheck(False) def inner_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[intp_t] left_sorter, right_sorter - ndarray[intp_t] left_count, right_count - ndarray[intp_t] left_indexer, right_indexer + intp_t[::1] left_sorter, right_sorter + intp_t[::1] left_count, right_count + intp_t[::1] left_indexer, right_indexer intp_t lc, rc - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset - # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) @@ -53,14 +48,13 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, if rc > 0 and lc > 0: count += lc * rc - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) with nogil: + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] for i in range(1, max_groups + 1): lc = left_count[i] rc = right_count[i] @@ -75,24 +69,27 @@ def inner_join(const intp_t[:] left, const intp_t[:] right, left_pos += lc right_pos += rc - return (_get_result_indexer(left_sorter, left_indexer), - _get_result_indexer(right_sorter, right_indexer)) + # Will overwrite left/right indexer with the result + _get_result_indexer(left_sorter, left_indexer) + _get_result_indexer(right_sorter, right_indexer) + + return np.asarray(left_indexer), np.asarray(right_indexer) +@cython.wraparound(False) @cython.boundscheck(False) def left_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups, bint sort=True): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[intp_t] left_count, right_count - ndarray[intp_t] rev, left_sorter, right_sorter - ndarray[intp_t] left_indexer, right_indexer + ndarray[intp_t] rev + intp_t[::1] left_count, right_count + intp_t[::1] left_sorter, right_sorter + intp_t[::1] left_indexer, right_indexer intp_t lc, rc - Py_ssize_t loc, left_pos = 0, right_pos = 0, position = 0 + Py_ssize_t left_pos = 0, right_pos = 0, position = 0 Py_ssize_t offset - # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) @@ -104,14 +101,13 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, else: count += left_count[i] - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) with nogil: + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] for i in range(1, max_groups + 1): lc = left_count[i] rc = right_count[i] @@ -131,40 +127,38 @@ def left_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos += lc right_pos += rc - left_indexer = _get_result_indexer(left_sorter, left_indexer) - right_indexer = _get_result_indexer(right_sorter, right_indexer) + # Will overwrite left/right indexer 
with the result + _get_result_indexer(left_sorter, left_indexer) + _get_result_indexer(right_sorter, right_indexer) if not sort: # if not asked to sort, revert to original order - # cast to avoid build warning GH#26757 - if len(left) == len(left_indexer): + if len(left) == len(left_indexer): # no multiple matches for any row on the left # this is a short-cut to avoid groupsort_indexer # otherwise, the `else` path also works in this case rev = np.empty(len(left), dtype=np.intp) - rev.put(left_sorter, np.arange(len(left))) + rev.put(np.asarray(left_sorter), np.arange(len(left))) else: rev, _ = groupsort_indexer(left_indexer, len(left)) - right_indexer = right_indexer.take(rev) - left_indexer = left_indexer.take(rev) - - return left_indexer, right_indexer + return np.asarray(left_indexer).take(rev), np.asarray(right_indexer).take(rev) + else: + return np.asarray(left_indexer), np.asarray(right_indexer) +@cython.wraparound(False) @cython.boundscheck(False) def full_outer_join(const intp_t[:] left, const intp_t[:] right, Py_ssize_t max_groups): cdef: Py_ssize_t i, j, k, count = 0 - ndarray[intp_t] left_sorter, right_sorter - ndarray[intp_t] left_count, right_count - ndarray[intp_t] left_indexer, right_indexer + intp_t[::1] left_sorter, right_sorter + intp_t[::1] left_count, right_count + intp_t[::1] left_indexer, right_indexer intp_t lc, rc intp_t left_pos = 0, right_pos = 0 Py_ssize_t offset, position = 0 - # NA group in location 0 - left_sorter, left_count = groupsort_indexer(left, max_groups) right_sorter, right_count = groupsort_indexer(right, max_groups) @@ -179,14 +173,13 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, else: count += lc + rc - # exclude the NA group - left_pos = left_count[0] - right_pos = right_count[0] - left_indexer = np.empty(count, dtype=np.intp) right_indexer = np.empty(count, dtype=np.intp) with nogil: + # exclude the NA group + left_pos = left_count[0] + right_pos = right_count[0] for i in range(1, max_groups + 1): lc = left_count[i] rc = right_count[i] @@ -211,24 +204,33 @@ def full_outer_join(const intp_t[:] left, const intp_t[:] right, left_pos += lc right_pos += rc - return (_get_result_indexer(left_sorter, left_indexer), - _get_result_indexer(right_sorter, right_indexer)) + # Will overwrite left/right indexer with the result + _get_result_indexer(left_sorter, left_indexer) + _get_result_indexer(right_sorter, right_indexer) + + return np.asarray(left_indexer), np.asarray(right_indexer) -cdef ndarray[intp_t] _get_result_indexer( - ndarray[intp_t] sorter, ndarray[intp_t] indexer -): +@cython.wraparound(False) +@cython.boundscheck(False) +cdef void _get_result_indexer(intp_t[::1] sorter, intp_t[::1] indexer) nogil: + """NOTE: overwrites indexer with the result to avoid allocating another array""" + cdef: + Py_ssize_t i, n, idx + if len(sorter) > 0: # cython-only equivalent to # `res = algos.take_nd(sorter, indexer, fill_value=-1)` - res = np.empty(len(indexer), dtype=np.intp) - take_1d_intp_intp(sorter, indexer, res, -1) + n = indexer.shape[0] + for i in range(n): + idx = indexer[i] + if idx == -1: + indexer[i] = -1 + else: + indexer[i] = sorter[idx] else: # length-0 case - res = np.empty(len(indexer), dtype=np.intp) - res[:] = -1 - - return res + indexer[:] = -1 def ffill_indexer(const intp_t[:] indexer) -> np.ndarray: diff --git a/pandas/_libs/khash.pxd b/pandas/_libs/khash.pxd index ba805e9ff1251..b9c18d6c86039 100644 --- a/pandas/_libs/khash.pxd +++ b/pandas/_libs/khash.pxd @@ -41,6 +41,9 @@ cdef extern from "khash_python.h": bint 
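
The join refactor above drops the allocating `take_1d_intp_intp` helper (removed from `algos_take_helper.pxi.in` earlier in this patch) and instead overwrites the indexer in place under `nogil`. In plain NumPy terms the operation is roughly the following sketch, not the Cython code itself:

```python
import numpy as np

def get_result_indexer_inplace(sorter: np.ndarray, indexer: np.ndarray) -> None:
    """Overwrite ``indexer`` with ``sorter[indexer]``, keeping -1 as the
    missing marker, instead of allocating a new result array."""
    if len(sorter):
        valid = indexer != -1
        indexer[valid] = sorter[indexer[valid]]
    else:
        # length-0 sorter: every position is a miss
        indexer[:] = -1

sorter = np.array([2, 0, 1], dtype=np.intp)
idx = np.array([0, 2, -1, 1], dtype=np.intp)
get_result_indexer_inplace(sorter, idx)
print(idx)  # [ 2  1 -1  0]
```
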
are_equivalent_float32_t \ "kh_floats_hash_equal" (float32_t a, float32_t b) nogil + uint32_t kh_python_hash_func(object key) + bint kh_python_hash_equal(object a, object b) + ctypedef struct kh_pymap_t: khuint_t n_buckets, size, n_occupied, upper_bound uint32_t *flags diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 3f4623638c70e..80b5954fa0d91 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -25,6 +25,9 @@ class NoDefault(Enum): ... no_default: NoDefault +i8max: int +u8max: int + def item_from_zerodim(val: object) -> object: ... def infer_dtype(value: object, skipna: bool = True) -> str: ... def is_iterator(obj: object) -> bool: ... @@ -48,6 +51,7 @@ def is_string_array(values: np.ndarray, skipna: bool = False): ... def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... +def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0aec7e5e5a363..748d16923ffb6 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -118,6 +118,10 @@ cdef: float64_t NaN = np.NaN +# python-visible +i8max = INT64_MAX +u8max = UINT64_MAX + @cython.wraparound(False) @cython.boundscheck(False) @@ -712,6 +716,14 @@ cpdef ndarray[object] ensure_string_array( Py_ssize_t i = 0, n = len(arr) if hasattr(arr, "to_numpy"): + + if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]: + # dtype check to exclude DataFrame + # GH#41409 TODO: not a great place for this + out = arr.astype(str).astype(object) + out[arr.isna()] = na_value + return out + arr = arr.to_numpy() elif not isinstance(arr, np.ndarray): arr = np.array(arr, dtype="object") @@ -894,12 +906,13 @@ def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask, return counts +@cython.wraparound(False) +@cython.boundscheck(False) def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): cdef: Py_ssize_t i, group_size, n, start intp_t lab - object slobj - ndarray[int64_t] starts, ends + int64_t[::1] starts, ends n = len(labels) @@ -908,19 +921,20 @@ def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups): start = 0 group_size = 0 - for i in range(n): - lab = labels[i] - if lab < 0: - start += 1 - else: - group_size += 1 - if i == n - 1 or lab != labels[i + 1]: - starts[lab] = start - ends[lab] = start + group_size - start += group_size - group_size = 0 + with nogil: + for i in range(n): + lab = labels[i] + if lab < 0: + start += 1 + else: + group_size += 1 + if i == n - 1 or lab != labels[i + 1]: + starts[lab] = start + ends[lab] = start + group_size + start += group_size + group_size = 0 - return starts, ends + return np.asarray(starts), np.asarray(ends) def indices_fast(ndarray[intp_t] index, const int64_t[:] labels, list keys, @@ -1091,7 +1105,7 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1: return ( # equiv: `isinstance(obj, abc.Iterable)` - hasattr(obj, "__iter__") and not isinstance(obj, type) + getattr(obj, "__iter__", None) is not None and not isinstance(obj, type) # we do not count strings/unicode/bytes as list-like and not isinstance(obj, (str, bytes)) # 
exclude zero-dimensional numpy arrays, effectively scalars @@ -2965,6 +2979,28 @@ def to_object_array_tuples(rows: object) -> np.ndarray: return result +@cython.wraparound(False) +@cython.boundscheck(False) +def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: + cdef: + Py_ssize_t i, n = len(keys) + object val + ndarray[object] output = np.empty(n, dtype='O') + + if n == 0: + # kludge, for Series + return np.empty(0, dtype='f8') + + for i in range(n): + val = keys[i] + if val in mapping: + output[i] = mapping[val] + else: + output[i] = default + + return maybe_convert_objects(output) + + def is_bool_list(obj: list) -> bool: """ Check if this list contains only bool or np.bool_ objects. diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index 7d7074988e5f0..e5e61e409c320 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -108,6 +108,7 @@ from pandas.core.dtypes.common import ( is_object_dtype, ) from pandas.core.dtypes.dtypes import CategoricalDtype +from pandas.core.dtypes.inference import is_dict_like cdef: float64_t INF = np.inf @@ -689,6 +690,7 @@ cdef class TextReader: count = counts.get(name, 0) if ( self.dtype is not None + and is_dict_like(self.dtype) and self.dtype.get(old_name) is not None and self.dtype.get(name) is None ): diff --git a/pandas/_libs/src/klib/khash_python.h b/pandas/_libs/src/klib/khash_python.h index aee018262e3a6..c0fca76ef701e 100644 --- a/pandas/_libs/src/klib/khash_python.h +++ b/pandas/_libs/src/klib/khash_python.h @@ -163,28 +163,198 @@ KHASH_MAP_INIT_COMPLEX128(complex128, size_t) #define kh_exist_complex128(h, k) (kh_exist(h, k)) +// NaN-floats should be in the same equivalency class, see GH 22119 +int PANDAS_INLINE floatobject_cmp(PyFloatObject* a, PyFloatObject* b){ + return ( + Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && + Py_IS_NAN(PyFloat_AS_DOUBLE(b)) + ) + || + ( PyFloat_AS_DOUBLE(a) == PyFloat_AS_DOUBLE(b) ); +} + + +// NaNs should be in the same equivalency class, see GH 41836 +// PyObject_RichCompareBool for complexobjects has a different behavior +// needs to be replaced +int PANDAS_INLINE complexobject_cmp(PyComplexObject* a, PyComplexObject* b){ + return ( + Py_IS_NAN(a->cval.real) && + Py_IS_NAN(b->cval.real) && + Py_IS_NAN(a->cval.imag) && + Py_IS_NAN(b->cval.imag) + ) + || + ( + Py_IS_NAN(a->cval.real) && + Py_IS_NAN(b->cval.real) && + a->cval.imag == b->cval.imag + ) + || + ( + a->cval.real == b->cval.real && + Py_IS_NAN(a->cval.imag) && + Py_IS_NAN(b->cval.imag) + ) + || + ( + a->cval.real == b->cval.real && + a->cval.imag == b->cval.imag + ); +} + +int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b); + + +// replacing PyObject_RichCompareBool (NaN!=NaN) with pyobject_cmp (NaN==NaN), +// which treats NaNs as equivalent +// see GH 41836 +int PANDAS_INLINE tupleobject_cmp(PyTupleObject* a, PyTupleObject* b){ + Py_ssize_t i; + + if (Py_SIZE(a) != Py_SIZE(b)) { + return 0; + } + + for (i = 0; i < Py_SIZE(a); ++i) { + if (!pyobject_cmp(PyTuple_GET_ITEM(a, i), PyTuple_GET_ITEM(b, i))) { + return 0; + } + } + return 1; +} + + int PANDAS_INLINE pyobject_cmp(PyObject* a, PyObject* b) { + if (a == b) { + return 1; + } + if (Py_TYPE(a) == Py_TYPE(b)) { + // special handling for some built-in types which could have NaNs + // as we would like to have them equivalent, but the usual + // PyObject_RichCompareBool would return False + if (PyFloat_CheckExact(a)) { + return floatobject_cmp((PyFloatObject*)a, (PyFloatObject*)b); + } + if (PyComplex_CheckExact(a)) { + return 
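
`floatobject_cmp`, `complexobject_cmp` and `tupleobject_cmp` above put all NaNs into one equivalence class (GH 22119, GH 41836), so hash-table lookups treat `nan == nan`, which plain `PyObject_RichCompareBool` does not. A rough Python analogue of the comparison rules, for orientation only:

```python
import math

def nan_aware_equal(a, b) -> bool:
    """Treat NaN as equal to NaN (simplified analogue of pyobject_cmp)."""
    if a is b:
        return True
    if type(a) is not type(b):
        return bool(a == b)
    if isinstance(a, float):
        return (math.isnan(a) and math.isnan(b)) or a == b
    if isinstance(a, complex):
        real_eq = (math.isnan(a.real) and math.isnan(b.real)) or a.real == b.real
        imag_eq = (math.isnan(a.imag) and math.isnan(b.imag)) or a.imag == b.imag
        return real_eq and imag_eq
    if isinstance(a, tuple):
        return len(a) == len(b) and all(nan_aware_equal(x, y) for x, y in zip(a, b))
    return bool(a == b)

print(nan_aware_equal(float("nan"), float("nan")))            # True
print(nan_aware_equal((1, float("nan")), (1, float("nan"))))  # True
print((1, float("nan")) == (1, float("nan")))                 # False (plain tuple compare)
```
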
complexobject_cmp((PyComplexObject*)a, (PyComplexObject*)b); + } + if (PyTuple_CheckExact(a)) { + return tupleobject_cmp((PyTupleObject*)a, (PyTupleObject*)b); + } + // frozenset isn't yet supported + } + int result = PyObject_RichCompareBool(a, b, Py_EQ); if (result < 0) { PyErr_Clear(); return 0; } - if (result == 0) { // still could be two NaNs - return PyFloat_CheckExact(a) && - PyFloat_CheckExact(b) && - Py_IS_NAN(PyFloat_AS_DOUBLE(a)) && - Py_IS_NAN(PyFloat_AS_DOUBLE(b)); - } return result; } -khint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key){ +Py_hash_t PANDAS_INLINE _Pandas_HashDouble(double val) { + //Since Python3.10, nan is no longer has hash 0 + if (Py_IS_NAN(val)) { + return 0; + } +#if PY_VERSION_HEX < 0x030A0000 + return _Py_HashDouble(val); +#else + return _Py_HashDouble(NULL, val); +#endif +} + + +Py_hash_t PANDAS_INLINE floatobject_hash(PyFloatObject* key) { + return _Pandas_HashDouble(PyFloat_AS_DOUBLE(key)); +} + + +#define _PandasHASH_IMAG 1000003UL + +// replaces _Py_HashDouble with _Pandas_HashDouble +Py_hash_t PANDAS_INLINE complexobject_hash(PyComplexObject* key) { + Py_uhash_t realhash = (Py_uhash_t)_Pandas_HashDouble(key->cval.real); + Py_uhash_t imaghash = (Py_uhash_t)_Pandas_HashDouble(key->cval.imag); + if (realhash == (Py_uhash_t)-1 || imaghash == (Py_uhash_t)-1) { + return -1; + } + Py_uhash_t combined = realhash + _PandasHASH_IMAG * imaghash; + if (combined == (Py_uhash_t)-1) { + return -2; + } + return (Py_hash_t)combined; +} + + +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key); + +//we could use any hashing algorithm, this is the original CPython's for tuples + +#if SIZEOF_PY_UHASH_T > 4 +#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)11400714785074694791ULL) +#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)14029467366897019727ULL) +#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)2870177450012600261ULL) +#define _PandasHASH_XXROTATE(x) ((x << 31) | (x >> 33)) /* Rotate left 31 bits */ +#else +#define _PandasHASH_XXPRIME_1 ((Py_uhash_t)2654435761UL) +#define _PandasHASH_XXPRIME_2 ((Py_uhash_t)2246822519UL) +#define _PandasHASH_XXPRIME_5 ((Py_uhash_t)374761393UL) +#define _PandasHASH_XXROTATE(x) ((x << 13) | (x >> 19)) /* Rotate left 13 bits */ +#endif + +Py_hash_t PANDAS_INLINE tupleobject_hash(PyTupleObject* key) { + Py_ssize_t i, len = Py_SIZE(key); + PyObject **item = key->ob_item; + + Py_uhash_t acc = _PandasHASH_XXPRIME_5; + for (i = 0; i < len; i++) { + Py_uhash_t lane = kh_python_hash_func(item[i]); + if (lane == (Py_uhash_t)-1) { + return -1; + } + acc += lane * _PandasHASH_XXPRIME_2; + acc = _PandasHASH_XXROTATE(acc); + acc *= _PandasHASH_XXPRIME_1; + } + + /* Add input length, mangled to keep the historical value of hash(()). 
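
The hashing side has to agree with the comparison above: since Python 3.10, `hash(nan)` is derived from the object's identity, so two distinct NaN objects hash differently and would scatter across buckets even though the comparison treats them as equal. `_Pandas_HashDouble` pins NaN back to hash 0; a tiny sketch of the idea:

```python
import math

def nan_stable_hash(x: float) -> int:
    """Map every NaN to the same hash (0), matching the NaN-aware equality rule."""
    return 0 if math.isnan(x) else hash(x)

a, b = float("nan"), float("nan")
print(hash(a) == hash(b))                        # False on Python >= 3.10 (identity-based)
print(nan_stable_hash(a) == nan_stable_hash(b))  # True
```
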
*/ + acc += len ^ (_PandasHASH_XXPRIME_5 ^ 3527539UL); + + if (acc == (Py_uhash_t)-1) { + return 1546275796; + } + return acc; +} + + +khuint32_t PANDAS_INLINE kh_python_hash_func(PyObject* key) { + Py_hash_t hash; // For PyObject_Hash holds: // hash(0.0) == 0 == hash(-0.0) - // hash(X) == 0 if X is a NaN-value - // so it is OK to use it directly for doubles - Py_hash_t hash = PyObject_Hash(key); + // yet for different nan-objects different hash-values + // are possible + if (PyFloat_CheckExact(key)) { + // we cannot use kh_float64_hash_func + // becase float(k) == k holds for any int-object k + // and kh_float64_hash_func doesn't respect it + hash = floatobject_hash((PyFloatObject*)key); + } + else if (PyComplex_CheckExact(key)) { + // we cannot use kh_complex128_hash_func + // becase complex(k,0) == k holds for any int-object k + // and kh_complex128_hash_func doesn't respect it + hash = complexobject_hash((PyComplexObject*)key); + } + else if (PyTuple_CheckExact(key)) { + hash = tupleobject_hash((PyTupleObject*)key); + } + else { + hash = PyObject_Hash(key); + } + if (hash == -1) { PyErr_Clear(); return 0; diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index edd3b58867e87..911cf07862acc 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -129,6 +129,13 @@ cdef inline object create_timestamp_from_ts(int64_t value, return ts_base +def _unpickle_timestamp(value, freq, tz): + # GH#41949 dont warn on unpickle if we have a freq + ts = Timestamp(value, tz=tz) + ts._set_freq(freq) + return ts + + # ---------------------------------------------------------------------- def integer_op_not_supported(obj): @@ -263,9 +270,9 @@ cdef class _Timestamp(ABCTimestamp): if op == Py_EQ: return False if op == Py_LE or op == Py_LT: - return other.year <= self.year + return self.year <= other.year if op == Py_GE or op == Py_GT: - return other.year >= self.year + return self.year >= other.year cdef bint _can_compare(self, datetime other): if self.tzinfo is not None: @@ -725,7 +732,7 @@ cdef class _Timestamp(ABCTimestamp): def __reduce__(self): object_state = self.value, self._freq, self.tzinfo - return (Timestamp, object_state) + return (_unpickle_timestamp, object_state) # ----------------------------------------------------------------- # Rendering Methods diff --git a/pandas/_libs/window/aggregations.pyx b/pandas/_libs/window/aggregations.pyx index 3d3a19a1c7a40..377fec76b8ccc 100644 --- a/pandas/_libs/window/aggregations.pyx +++ b/pandas/_libs/window/aggregations.pyx @@ -1486,7 +1486,7 @@ def roll_weighted_var(const float64_t[:] values, const float64_t[:] weights, def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, int minp, float64_t com, bint adjust, bint ignore_na, - const float64_t[:] deltas) -> np.ndarray: + const float64_t[:] deltas=None) -> np.ndarray: """ Compute exponentially-weighted moving average using center-of-mass. @@ -1499,7 +1499,8 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, com : float64 adjust : bool ignore_na : bool - deltas : ndarray (float64 type) + deltas : ndarray (float64 type), optional. 
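
With the `__reduce__` change above, unpickling a `Timestamp` goes through the new `_unpickle_timestamp` helper, which restores a stored frequency via `_set_freq` instead of the deprecated `freq=` constructor argument (GH#41949). A simple round-trip check, illustrative only:

```python
import pickle
import pandas as pd

ts = pd.Timestamp("2021-07-01 12:00", tz="UTC")
restored = pickle.loads(pickle.dumps(ts))
# GH#41949: if a freq is attached it is re-set on the restored object
# without emitting the Timestamp.freq deprecation warning.
assert restored == ts and restored.tz == ts.tz
```
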
If None, implicitly assumes equally + spaced points (used when `times` is not passed) Returns ------- @@ -1508,14 +1509,17 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, cdef: Py_ssize_t i, j, s, e, nobs, win_size, N = len(vals), M = len(start) - const float64_t[:] sub_deltas, sub_vals + const float64_t[:] sub_vals + const float64_t[:] sub_deltas=None ndarray[float64_t] sub_output, output = np.empty(N, dtype=np.float64) float64_t alpha, old_wt_factor, new_wt, weighted_avg, old_wt, cur - bint is_observation + bint is_observation, use_deltas if N == 0: return output + use_deltas = deltas is not None + alpha = 1. / (1. + com) old_wt_factor = 1. - alpha new_wt = 1. if adjust else alpha @@ -1526,7 +1530,8 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, sub_vals = vals[s:e] # note that len(deltas) = len(vals) - 1 and deltas[i] is to be used in # conjunction with vals[i+1] - sub_deltas = deltas[s:e - 1] + if use_deltas: + sub_deltas = deltas[s:e - 1] win_size = len(sub_vals) sub_output = np.empty(win_size, dtype=np.float64) @@ -1544,7 +1549,10 @@ def ewma(const float64_t[:] vals, const int64_t[:] start, const int64_t[:] end, if weighted_avg == weighted_avg: if is_observation or not ignore_na: - old_wt *= old_wt_factor ** sub_deltas[i - 1] + if use_deltas: + old_wt *= old_wt_factor ** sub_deltas[i - 1] + else: + old_wt *= old_wt_factor if is_observation: # avoid numerical errors on constant series diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index d188770576e05..197345b3ce6ac 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -79,12 +79,11 @@ def calculate_variable_window_bounds( else: end[0] = 0 if center: - for j in range(0, num_values + 1): - if (index[j] == index[0] + index_growth_sign * window_size / 2 and - right_closed): + end_bound = index[0] + index_growth_sign * window_size / 2 + for j in range(0, num_values): + if (index[j] < end_bound) or (index[j] == end_bound and right_closed): end[0] = j + 1 - break - elif index[j] >= index[0] + index_growth_sign * window_size / 2: + elif index[j] >= end_bound: end[0] = j break diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f26cf113f7d5e..3ab0350f23c5a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -11,11 +11,7 @@ Union, cast, ) -from warnings import ( - catch_warnings, - simplefilter, - warn, -) +from warnings import warn import numpy as np @@ -31,6 +27,7 @@ DtypeObj, FrameOrSeriesUnion, Scalar, + final, ) from pandas.util._decorators import doc @@ -140,7 +137,11 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: return np.asarray(values).view("uint8"), values.dtype else: # i.e. 
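
`ewma` above now accepts `deltas=None` and only applies the per-step exponent when `times` was passed. The recursion for the default `adjust=False` case, written out in plain Python as a sketch (assumes no NaNs; `deltas[i-1]` scales the decay applied before observation `i`, and `deltas=None` means equally spaced points):

```python
def ewma(vals, com, deltas=None):
    """EW moving average, adjust=False, no NaNs; deltas=None means a delta
    of 1 between every pair of consecutive observations."""
    alpha = 1.0 / (1.0 + com)
    old_wt_factor = 1.0 - alpha
    out = [vals[0]]
    for i in range(1, len(vals)):
        delta = 1.0 if deltas is None else deltas[i - 1]
        old_wt = old_wt_factor ** delta   # decay applied to the running average
        out.append((old_wt * out[-1] + alpha * vals[i]) / (old_wt + alpha))
    return out

# matches Series([1, 2, 3]).ewm(com=1, adjust=False).mean() -> [1.0, 1.5, 2.25]
print(ewma([1.0, 2.0, 3.0], com=1))
```
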
all-bool Categorical, BooleanArray - return np.asarray(values).astype("uint8", copy=False), values.dtype + try: + return np.asarray(values).astype("uint8", copy=False), values.dtype + except TypeError: + # GH#42107 we have pd.NAs present + return np.asarray(values), values.dtype elif is_integer_dtype(values.dtype): return np.asarray(values), values.dtype @@ -155,12 +156,10 @@ def _ensure_data(values: ArrayLike) -> tuple[np.ndarray, DtypeObj]: return np.asarray(values), values.dtype elif is_complex_dtype(values.dtype): - # ignore the fact that we are casting to float - # which discards complex parts - with catch_warnings(): - simplefilter("ignore", np.ComplexWarning) - values = ensure_float64(values) - return values, np.dtype("float64") + # Incompatible return value type (got "Tuple[Union[Any, ExtensionArray, + # ndarray[Any, Any]], Union[Any, ExtensionDtype]]", expected + # "Tuple[ndarray[Any, Any], Union[dtype[Any], ExtensionDtype]]") + return values, values.dtype # type: ignore[return-value] # datetimelike elif needs_i8_conversion(values.dtype): @@ -242,6 +241,8 @@ def _ensure_arraylike(values) -> ArrayLike: _hashtables = { + "complex128": htable.Complex128HashTable, + "complex64": htable.Complex64HashTable, "float64": htable.Float64HashTable, "float32": htable.Float32HashTable, "uint64": htable.UInt64HashTable, @@ -1092,18 +1093,19 @@ def checked_add_with_arr( # it is negative, we then check whether its sum with the element in # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow # error as well. + i8max = lib.i8max + i8min = iNaT + mask1 = b2 > 0 mask2 = b2 < 0 if not mask1.any(): - to_raise = ((np.iinfo(np.int64).min - b2 > arr) & not_nan).any() + to_raise = ((i8min - b2 > arr) & not_nan).any() elif not mask2.any(): - to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() + to_raise = ((i8max - b2 < arr) & not_nan).any() else: - to_raise = ( - (np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1] - ).any() or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or ( + (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2] ).any() if to_raise: @@ -1214,12 +1216,15 @@ def __init__(self, obj, n: int, keep: str): def compute(self, method: str) -> FrameOrSeriesUnion: raise NotImplementedError + @final def nlargest(self): return self.compute("nlargest") + @final def nsmallest(self): return self.compute("nsmallest") + @final @staticmethod def is_valid_dtype_n_method(dtype: DtypeObj) -> bool: """ @@ -1258,6 +1263,18 @@ def compute(self, method: str) -> Series: dropped = self.obj.dropna() + if is_extension_array_dtype(dropped.dtype): + # GH#41816 bc we have dropped NAs above, MaskedArrays can use the + # numpy logic. 
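
`checked_add_with_arr` above replaces the repeated `np.iinfo(np.int64)` lookups with the cached `lib.i8max` and `iNaT` (which equals the int64 minimum); the overflow pre-check itself is unchanged. In isolation it amounts to the following sketch:

```python
import numpy as np

i8max = np.iinfo(np.int64).max
i8min = np.iinfo(np.int64).min  # same value as pandas' iNaT sentinel

def add_would_overflow(arr: np.ndarray, b: np.ndarray) -> bool:
    """Return True if element-wise int64 addition arr + b would overflow."""
    pos = b > 0
    neg = b < 0
    return bool(
        (i8max - b[pos] < arr[pos]).any() or (i8min - b[neg] > arr[neg]).any()
    )

arr = np.array([i8max - 1, 5], dtype=np.int64)
print(add_would_overflow(arr, np.array([1, 1], dtype=np.int64)))  # False
print(add_would_overflow(arr, np.array([2, 1], dtype=np.int64)))  # True
```
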
+ from pandas.core.arrays import BaseMaskedArray + + arr = dropped._values + if isinstance(arr, BaseMaskedArray): + ser = type(dropped)(arr._data, index=dropped.index, name=dropped.name) + + result = type(self)(ser, n=self.n, keep=self.keep).compute(method) + return result.astype(arr.dtype) + # slow method if n >= len(self.obj): ascending = method == "nsmallest" diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 388c1881afed7..aae3262893da2 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -348,6 +348,7 @@ def agg_list_like(self) -> FrameOrSeriesUnion: # multiples else: + indices = [] for index, col in enumerate(selected_obj): colg = obj._gotitem(col, ndim=1, subset=selected_obj.iloc[:, index]) try: @@ -369,7 +370,9 @@ def agg_list_like(self) -> FrameOrSeriesUnion: raise else: results.append(new_res) - keys.append(col) + indices.append(index) + + keys = selected_obj.columns.take(indices) # if we are empty if not len(results): @@ -407,6 +410,7 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: ------- Result of aggregation. """ + from pandas import Index from pandas.core.reshape.concat import concat obj = self.obj @@ -443,8 +447,18 @@ def agg_dict_like(self) -> FrameOrSeriesUnion: keys_to_use = [k for k in keys if not results[k].empty] # Have to check, if at least one DataFrame is not empty. keys_to_use = keys_to_use if keys_to_use != [] else keys + if selected_obj.ndim == 2: + # keys are columns, so we can preserve names + ktu = Index(keys_to_use) + ktu._set_names(selected_obj.columns.names) + # Incompatible types in assignment (expression has type "Index", + # variable has type "List[Hashable]") + keys_to_use = ktu # type: ignore[assignment] + axis = 0 if isinstance(obj, ABCSeries) else 1 - result = concat({k: results[k] for k in keys_to_use}, axis=axis) + result = concat( + {k: results[k] for k in keys_to_use}, axis=axis, keys=keys_to_use + ) elif any(is_ndframe): # There is a mix of NDFrames and scalars raise ValueError( @@ -677,21 +691,28 @@ def agg(self): obj = self.obj axis = self.axis + # TODO: Avoid having to change state + self.obj = self.obj if self.axis == 0 else self.obj.T + self.axis = 0 + + result = None + try: + result = super().agg() + except TypeError as err: + exc = TypeError( + "DataFrame constructor called with " + f"incompatible data and dtype: {err}" + ) + raise exc from err + finally: + self.obj = obj + self.axis = axis + if axis == 1: - result = FrameRowApply( - obj.T, - self.orig_f, - self.raw, - self.result_type, - self.args, - self.kwargs, - ).agg() result = result.T if result is not None else result - else: - result = super().agg() if result is None: - result = obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) + result = self.obj.apply(self.orig_f, axis, args=self.args, **self.kwargs) return result @@ -842,7 +863,7 @@ def apply_str(self) -> FrameOrSeriesUnion: # Special-cased because DataFrame.size returns a single scalar obj = self.obj value = obj.shape[self.axis] - return obj._constructor_sliced(value, index=self.agg_axis, name="size") + return obj._constructor_sliced(value, index=self.agg_axis) return super().apply_str() @@ -1062,11 +1083,7 @@ def apply_standard(self) -> FrameOrSeriesUnion: with np.errstate(all="ignore"): if isinstance(f, np.ufunc): - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "Series"; expected "Union[Union[int, float, complex, str, bytes, - # generic], Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - return f(obj) # type: 
ignore[arg-type] + return f(obj) # row-wise access if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"): diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 3a67f7d871f86..a8f69497d4019 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -41,8 +41,14 @@ def putmask_inplace(values: ArrayLike, mask: np.ndarray, value: Any) -> None: if lib.is_scalar(value) and isinstance(values, np.ndarray): value = convert_scalar_for_putitemlike(value, values.dtype) - if not isinstance(values, np.ndarray) or ( - values.dtype == object and not lib.is_scalar(value) + if ( + not isinstance(values, np.ndarray) + or (values.dtype == object and not lib.is_scalar(value)) + # GH#43424: np.putmask raises TypeError if we cannot cast between types with + # rule = "safe", a stricter guarantee we may not have here + or ( + isinstance(value, np.ndarray) and not np.can_cast(value.dtype, values.dtype) + ) ): # GH#19266 using np.putmask gives unexpected results with listlike value if is_list_like(value) and len(value) == len(values): diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index 32c50ed38eba0..c5e96f32e261f 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -181,5 +181,10 @@ def _quantile_ea_fallback( assert res.ndim == 2 assert res.shape[0] == 1 res = res[0] - out = type(values)._from_sequence(res, dtype=values.dtype) + try: + out = type(values)._from_sequence(res, dtype=values.dtype) + except TypeError: + # GH#42626: not able to safely cast Int64 + # for floating point output + out = np.atleast_2d(np.asarray(res, dtype=np.float64)) return out diff --git a/pandas/core/arrays/_ranges.py b/pandas/core/arrays/_ranges.py index cac9fcd40fa52..3909875e5660a 100644 --- a/pandas/core/arrays/_ranges.py +++ b/pandas/core/arrays/_ranges.py @@ -6,6 +6,7 @@ import numpy as np +from pandas._libs.lib import i8max from pandas._libs.tslibs import ( BaseOffset, OutOfBoundsDatetime, @@ -103,7 +104,7 @@ def _generate_range_overflow_safe( # GH#14187 raise instead of incorrectly wrapping around assert side in ["start", "end"] - i64max = np.uint64(np.iinfo(np.int64).max) + i64max = np.uint64(i8max) msg = f"Cannot generate range with {side}={endpoint} and periods={periods}" with np.errstate(over="raise"): @@ -180,7 +181,7 @@ def _generate_range_overflow_safe_signed( # error: Incompatible types in assignment (expression has type # "unsignedinteger[_64Bit]", variable has type "signedinteger[_64Bit]") result = np.uint64(endpoint) + np.uint64(addend) # type: ignore[assignment] - i64max = np.uint64(np.iinfo(np.int64).max) + i64max = np.uint64(i8max) assert result > i64max if result <= i64max + np.uint64(stride): # error: Incompatible return value type (got "unsignedinteger", expected diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 888c7cbbffb59..96bd4280f4da4 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -1296,8 +1296,10 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): """ raise TypeError(f"cannot perform {name} with type {self.dtype}") - def __hash__(self) -> int: - raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 + # Incompatible types in assignment (expression has type "None", base class + # "object" defined the type as "Callable[[object], int]") + __hash__: None # type: ignore[assignment] # 
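
The `putmask_inplace` change above adds an `np.can_cast` guard (GH#43424): when the replacement array cannot be safely cast to the target dtype, the listlike fallback is taken instead of calling `np.putmask`. The new condition on its own, as a sketch:

```python
import numpy as np

values = np.arange(5)                           # int64 target
value = np.array([0.5, 1.5, 2.5, 3.5, 4.5])     # float64 replacement

# False under the default "safe" casting rule, so putmask_inplace now
# routes this case through the listlike fallback rather than np.putmask.
print(np.can_cast(value.dtype, values.dtype))
```
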
------------------------------------------------------------------------ # Non-Optimized Default Methods diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index ecc45357db8c1..3fdb52a73dc3e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -26,6 +26,7 @@ NaT, algos as libalgos, hashtable as htable, + lib, ) from pandas._libs.arrays import NDArrayBacked from pandas._libs.lib import no_default @@ -523,6 +524,7 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: try: new_cats = np.asarray(self.categories) new_cats = new_cats.astype(dtype=dtype, copy=copy) + fill_value = lib.item_from_zerodim(np.array(np.nan).astype(dtype)) except ( TypeError, # downstream error msg for CategoricalIndex is misleading ValueError, @@ -530,7 +532,9 @@ def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}" raise ValueError(msg) - result = take_nd(new_cats, ensure_platform_int(self._codes)) + result = take_nd( + new_cats, ensure_platform_int(self._codes), fill_value=fill_value + ) return result diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 2318cae004c5a..dd45029336f63 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -64,6 +64,7 @@ from pandas.core.algorithms import ( isin, take, + unique, value_counts, ) from pandas.core.arrays.base import ( @@ -1610,6 +1611,29 @@ def _combined(self) -> ArrayLike: comb = np.concatenate([left, right], axis=1) return comb + def _from_combined(self, combined: np.ndarray) -> IntervalArray: + """ + Create a new IntervalArray with our dtype from a 1D complex128 ndarray. + """ + nc = combined.view("i8").reshape(-1, 2) + + dtype = self._left.dtype + if needs_i8_conversion(dtype): + new_left = type(self._left)._from_sequence(nc[:, 0], dtype=dtype) + new_right = type(self._right)._from_sequence(nc[:, 1], dtype=dtype) + else: + new_left = nc[:, 0].view(dtype) + new_right = nc[:, 1].view(dtype) + return self._shallow_copy(left=new_left, right=new_right) + + def unique(self) -> IntervalArray: + # Invalid index type "Tuple[slice, int]" for "Union[ExtensionArray, + # ndarray[Any, Any]]"; expected type "Union[int, integer[Any], slice, + # Sequence[int], ndarray[Any, Any]]" + nc = unique(self._combined.view("complex128")[:, 0]) # type: ignore[index] + nc = nc[:, None] + return self._from_combined(nc) + def _maybe_convert_platform_interval(values) -> ArrayLike: """ diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index d274501143916..1d78a74db98f0 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -403,15 +403,21 @@ def isin(self, values) -> BooleanArray: # type: ignore[override] from pandas.core.arrays import BooleanArray - result = isin(self._data, values) + # algorithms.isin will eventually convert values to an ndarray, so no extra + # cost to doing it here first + values_arr = np.asarray(values) + result = isin(self._data, values_arr) + if self._hasna: - if libmissing.NA in values: - result += self._mask - else: - result *= np.invert(self._mask) - # error: No overload variant of "zeros_like" matches argument types - # "BaseMaskedArray", "Type[bool]" - mask = np.zeros_like(self, dtype=bool) # type: ignore[call-overload] + values_have_NA = is_object_dtype(values_arr.dtype) and any( + val is self.dtype.na_value for val in values_arr + ) + + # For now, NA does not propagate so set result according to presence of NA, + # 
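
The new `IntervalArray.unique` above reuses `_combined` (left and right endpoints side by side) viewed as `complex128`, so each interval collapses to a single scalar that can be de-duplicated, and `_from_combined` views the result back. A stand-alone sketch of the view trick, with plain `np.unique` standing in for pandas' NaN-aware `unique` and assuming non-NaN bit patterns:

```python
import numpy as np

left = np.array([0, 0, 2], dtype=np.int64)
right = np.array([1, 1, 3], dtype=np.int64)

combined = np.column_stack([left, right])        # shape (3, 2), int64
as_complex = combined.view("complex128")[:, 0]   # one complex128 per interval
uniq = np.unique(as_complex)                     # stand-in for pandas' unique

nc = uniq.view("i8").reshape(-1, 2)              # back to (left, right) pairs
print(nc)  # [[0 1]
           #  [2 3]]
```
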
see https://github.com/pandas-dev/pandas/pull/38379 for some discussion + result[self._mask] = values_have_NA + + mask = np.zeros(self._data.shape, dtype=bool) return BooleanArray(result, mask, copy=False) def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT: diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 04db06ee9fb66..471ee295ebd2f 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -341,7 +341,9 @@ def freq(self) -> BaseOffset: def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: if dtype == "i8": return self.asi8 - elif dtype == bool: + # error: Non-overlapping equality check (left operand type: "Optional[Union[str, + # dtype[Any]]]", right operand type: "Type[bool]") + elif dtype == bool: # type: ignore[comparison-overlap] return ~self._isnan # This will raise TypeError for non-object dtypes diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 7d3917203d7b6..e6e04050f08bf 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1463,11 +1463,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): return type(self)(result) def __abs__(self): - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "SparseArray"; expected "Union[Union[int, float, complex, str, bytes, - # generic], Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - return np.abs(self) # type: ignore[arg-type] + return np.abs(self) # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/common.py b/pandas/core/common.py index 183607ebb489d..3a3af87ff788e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -143,7 +143,11 @@ def is_bool_indexer(key: Any) -> bool: return True elif isinstance(key, list): # check if np.array(key).dtype would be bool - return len(key) > 0 and lib.is_bool_list(key) + if len(key) > 0: + if type(key) is not list: + # GH#42461 cython will raise TypeError if we pass a subclass + key = list(key) + return lib.is_bool_list(key) return False @@ -410,7 +414,7 @@ def random_state(state=None): Returns ------- - np.random.RandomState + np.random.RandomState or np.random if state is None """ if ( diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index f733a5c43dfb3..528c7f1a6af20 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -226,11 +226,7 @@ def stringify(value): if v not in metadata: result = -1 else: - # error: Incompatible types in assignment (expression has type - # "Union[Any, ndarray]", variable has type "int") - result = metadata.searchsorted( # type: ignore[assignment] - v, side="left" - ) + result = metadata.searchsorted(v, side="left") return TermValue(result, result, "integer") elif kind == "integer": v = int(float(v)) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 7e7205d1351b3..9d48bf612eb11 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -552,7 +552,6 @@ def sanitize_array( subarr = subarr.astype(dtype, copy=copy) elif copy: subarr = subarr.copy() - return subarr else: if isinstance(data, (set, frozenset)): @@ -560,8 +559,11 @@ def sanitize_array( raise TypeError(f"'{type(data).__name__}' type is unordered") # materialize e.g. generators, convert e.g. 
tuples, abc.ValueView - # TODO: non-standard array-likes we can convert to ndarray more efficiently? - data = list(data) + if hasattr(data, "__array__"): + # e.g. dask array GH#38645 + data = np.asarray(data) + else: + data = list(data) if dtype is not None or len(data) == 0: subarr = _try_cast(data, dtype, copy, raise_cast_failure) diff --git a/pandas/core/dtypes/base.py b/pandas/core/dtypes/base.py index e52b318c0b4f7..5b7dadac5d914 100644 --- a/pandas/core/dtypes/base.py +++ b/pandas/core/dtypes/base.py @@ -12,6 +12,7 @@ import numpy as np +from pandas._libs.hashtable import object_hash from pandas._typing import ( DtypeObj, type_t, @@ -128,7 +129,9 @@ def __eq__(self, other: Any) -> bool: return False def __hash__(self) -> int: - return hash(tuple(getattr(self, attr) for attr in self._metadata)) + # for python>=3.10, different nan objects have different hashes + # we need to avoid that und thus use hash function with old behavior + return object_hash(tuple(getattr(self, attr) for attr in self._metadata)) def __ne__(self, other: Any) -> bool: return not self.__eq__(other) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 433d45d94167d..66f835212212b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -15,6 +15,7 @@ TYPE_CHECKING, Any, Sized, + TypeVar, cast, overload, ) @@ -57,7 +58,6 @@ is_complex_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, is_dtype_equal, is_extension_array_dtype, is_float, @@ -92,8 +92,6 @@ ) if TYPE_CHECKING: - from typing import Literal - from pandas.core.arrays import ( DatetimeArray, ExtensionArray, @@ -107,6 +105,8 @@ _int32_max = np.iinfo(np.int32).max _int64_max = np.iinfo(np.int64).max +NumpyArrayT = TypeVar("NumpyArrayT", bound=np.ndarray) + def maybe_convert_platform( values: list | tuple | range | np.ndarray | ExtensionArray, @@ -179,9 +179,7 @@ def maybe_box_native(value: Scalar) -> Scalar: ------- scalar or Series """ - if is_datetime_or_timedelta_dtype(value): - value = maybe_box_datetimelike(value) - elif is_float(value): + if is_float(value): # error: Argument 1 to "float" has incompatible type # "Union[Union[str, int, float, bool], Union[Any, Timestamp, Timedelta, Any]]"; # expected "Union[SupportsFloat, _SupportsIndex, str]" @@ -193,6 +191,8 @@ def maybe_box_native(value: Scalar) -> Scalar: value = int(value) # type: ignore[arg-type] elif is_bool(value): value = bool(value) + elif isinstance(value, (np.datetime64, np.timedelta64)): + value = maybe_box_datetimelike(value) return value @@ -659,7 +659,10 @@ def _ensure_dtype_type(value, dtype: np.dtype): object """ # Start with exceptions in which we do _not_ cast to numpy types - if dtype == np.object_: + + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[object_]") + if dtype == np.object_: # type: ignore[comparison-overlap] return value # Note: before we get here we have already excluded isna(value) @@ -775,6 +778,21 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val +def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: + """ + Convert datetimelike-keyed dicts to a Timestamp-keyed dict. 
+ + Parameters + ---------- + d: dict-like object + + Returns + ------- + dict + """ + return {maybe_box_datetimelike(key): value for key, value in d.items()} + + def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: @@ -866,10 +884,10 @@ def maybe_infer_dtype_type(element): def maybe_upcast( - values: np.ndarray, + values: NumpyArrayT, fill_value: Scalar = np.nan, copy: bool = False, -) -> tuple[np.ndarray, Scalar]: +) -> tuple[NumpyArrayT, Scalar]: """ Provide explicit type promotion and coercion. @@ -1073,14 +1091,11 @@ def astype_nansafe( The dtype was a datetime64/timedelta64 dtype, but it had no unit. """ if arr.ndim > 1: - # Make sure we are doing non-copy ravel and reshape. - flags = arr.flags - flat = arr.ravel("K") + flat = arr.ravel() result = astype_nansafe(flat, dtype, copy=copy, skipna=skipna) - order: Literal["C", "F"] = "F" if flags.f_contiguous else "C" # error: Item "ExtensionArray" of "Union[ExtensionArray, ndarray]" has no # attribute "reshape" - return result.reshape(arr.shape, order=order) # type: ignore[union-attr] + return result.reshape(arr.shape) # type: ignore[union-attr] # We get here with 0-dim from sparse arr = np.atleast_1d(arr) @@ -1093,7 +1108,10 @@ def astype_nansafe( raise ValueError("dtype must be np.dtype or ExtensionDtype") if arr.dtype.kind in ["m", "M"] and ( - issubclass(dtype.type, str) or dtype == object + issubclass(dtype.type, str) + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[object]") + or dtype == object # type: ignore[comparison-overlap] ): from pandas.core.construction import ensure_wrapped_if_datetimelike @@ -1104,7 +1122,9 @@ def astype_nansafe( return lib.ensure_string_array(arr, skipna=skipna, convert_na_value=False) elif is_datetime64_dtype(arr): - if dtype == np.int64: + # Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + if dtype == np.int64: # type: ignore[comparison-overlap] warnings.warn( f"casting {arr.dtype} values to int64 with .astype(...) " "is deprecated and will raise in a future version. " @@ -1124,7 +1144,9 @@ def astype_nansafe( raise TypeError(f"cannot astype a datetimelike from [{arr.dtype}] to [{dtype}]") elif is_timedelta64_dtype(arr): - if dtype == np.int64: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + if dtype == np.int64: # type: ignore[comparison-overlap] warnings.warn( f"casting {arr.dtype} values to int64 with .astype(...) " "is deprecated and will raise in a future version. 
" @@ -1398,10 +1420,9 @@ def convert_dtypes( if is_string_dtype(inferred_dtype): if not convert_string: - inferred_dtype = input_array.dtype + return input_array.dtype else: - inferred_dtype = pandas_dtype("string") - return inferred_dtype + return pandas_dtype("string") if convert_integer: target_int_dtype = pandas_dtype("Int64") @@ -1454,7 +1475,9 @@ def convert_dtypes( else: return input_array.dtype - return inferred_dtype + # error: Incompatible return value type (got "Union[str, Union[dtype[Any], + # ExtensionDtype]]", expected "Union[dtype[Any], ExtensionDtype]") + return inferred_dtype # type: ignore[return-value] def maybe_infer_to_datetimelike( @@ -1831,7 +1854,9 @@ def construct_2d_arraylike_from_scalar( if dtype.kind in ["m", "M"]: value = maybe_unbox_datetimelike_tz_deprecation(value, dtype, stacklevel=4) - elif dtype == object: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[object]") + elif dtype == object: # type: ignore[comparison-overlap] if isinstance(value, (np.timedelta64, np.datetime64)): # calling np.array below would cast to pytimedelta/pydatetime out = np.empty(shape, dtype=object) @@ -2159,6 +2184,11 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: # ExtensionBlock._can_hold_element return True + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[object]") + if dtype == object: # type: ignore[comparison-overlap] + return True + tipo = maybe_infer_dtype_type(element) if dtype.kind in ["i", "u"]: @@ -2206,9 +2236,6 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: return tipo.kind == "b" return lib.is_bool(element) - elif dtype == object: - return True - elif dtype.kind == "S": # TODO: test tests.frame.methods.test_replace tests get here, # need more targeted tests. xref phofl has a PR about this diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 91b9bdd564676..47c6faa725774 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -158,6 +158,7 @@ from pandas.core.indexers import check_key_length from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( + CategoricalIndex, DatetimeIndex, Index, PeriodIndex, @@ -647,7 +648,7 @@ def __init__( elif isinstance(data, (np.ndarray, Series, Index)): if data.dtype.names: # i.e. 
numpy structured array - + data = cast(np.ndarray, data) mgr = rec_array_to_mgr( data, index, @@ -2279,9 +2280,7 @@ def to_records( if dtype_mapping is None: formats.append(v.dtype) elif isinstance(dtype_mapping, (type, np.dtype, str)): - # error: Argument 1 to "append" of "list" has incompatible type - # "Union[type, dtype, str]"; expected "dtype" - formats.append(dtype_mapping) # type: ignore[arg-type] + formats.append(dtype_mapping) else: element = "row" if i < index_len else "column" msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}" @@ -3345,8 +3344,8 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: values = self.values new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values] - result = self._constructor( - dict(zip(self.index, new_values)), index=self.columns + result = type(self)._from_arrays( + new_values, index=self.columns, columns=self.index ) else: @@ -3434,6 +3433,9 @@ def __getitem__(self, key): indexer = lib.maybe_indices_to_slice( indexer.astype(np.intp, copy=False), len(self) ) + if isinstance(indexer, np.ndarray): + # GH#43223 If we can not convert, use take + return self.take(indexer, axis=0) # either we have a slice or we have a string that can be converted # to a slice for partial-string date indexing return self._slice(indexer, axis=0) @@ -3553,6 +3555,11 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: Returns ------- scalar + + Notes + ----- + Assumes that index and columns both have ax._index_as_unique; + caller is responsible for checking. """ if takeable: series = self._ixs(col, axis=1) @@ -3561,20 +3568,21 @@ def _get_value(self, index, col, takeable: bool = False) -> Scalar: series = self._get_item_cache(col) engine = self.index._engine + if isinstance(self.index, CategoricalIndex): + # Trying to use the engine fastpath may give incorrect results + # if our categories are integers that dont match our codes + col = self.columns.get_loc(col) + index = self.index.get_loc(index) + return self._get_value(index, col, takeable=True) + try: loc = engine.get_loc(index) return series._values[loc] - except KeyError: - # GH 20629 - if self.index.nlevels > 1: - # partial indexing forbidden - raise - - # we cannot handle direct indexing - # use positional - col = self.columns.get_loc(col) - index = self.index.get_loc(index) - return self._get_value(index, col, takeable=True) + except AttributeError: + # IntervalTree has no get_loc + col = self.columns.get_loc(col) + index = self.index.get_loc(index) + return self._get_value(index, col, takeable=True) def __setitem__(self, key, value): key = com.apply_if_callable(key, self) @@ -3592,9 +3600,11 @@ def __setitem__(self, key, value): self._setitem_array(key, value) elif isinstance(value, DataFrame): self._set_item_frame_value(key, value) - elif is_list_like(value) and 1 < len( - self.columns.get_indexer_for([key]) - ) == len(value): + elif ( + is_list_like(value) + and not self.columns.is_unique + and 1 < len(self.columns.get_indexer_for([key])) == len(value) + ): # Column to set is duplicated self._setitem_array([key], value) else: @@ -3749,7 +3759,7 @@ def _set_item_mgr(self, key, value: ArrayLike) -> None: # try to set first as we want an invalid # value exception to occur first if len(self): - self._check_setitem_copy() + self._check_setitem_copy(stacklevel=5) def _iset_item(self, loc: int, value) -> None: arraylike = self._sanitize_column(value) @@ -5280,45 +5290,28 @@ def shift( axis = self._get_axis_number(axis) ncols = len(self.columns) + if axis 
== 1 and periods != 0 and fill_value is lib.no_default and ncols > 0: + # We will infer fill_value to match the closest column - if ( - axis == 1 - and periods != 0 - and ncols > 0 - and (fill_value is lib.no_default or len(self._mgr.arrays) > 1) - ): - # Exclude single-array-with-fill_value case so we issue a FutureWarning - # if an integer is passed with datetimelike dtype GH#31971 - from pandas import concat + # Use a column that we know is valid for our column's dtype GH#38434 + label = self.columns[0] - # tail: the data that is still in our shifted DataFrame if periods > 0: - tail = self.iloc[:, :-periods] + result = self.iloc[:, :-periods] + for col in range(min(ncols, abs(periods))): + # TODO(EA2D): doing this in a loop unnecessary with 2D EAs + # Define filler inside loop so we get a copy + filler = self.iloc[:, 0].shift(len(self)) + result.insert(0, label, filler, allow_duplicates=True) else: - tail = self.iloc[:, -periods:] - # pin a simple Index to avoid costly casting - tail.columns = range(len(tail.columns)) - - if fill_value is not lib.no_default: - # GH#35488 - # TODO(EA2D): with 2D EAs we could construct other directly - ser = Series(fill_value, index=self.index) - else: - # We infer fill_value to match the closest column - if periods > 0: - ser = self.iloc[:, 0].shift(len(self)) - else: - ser = self.iloc[:, -1].shift(len(self)) - - width = min(abs(periods), ncols) - other = concat([ser] * width, axis=1) - - if periods > 0: - result = concat([other, tail], axis=1) - else: - result = concat([tail, other], axis=1) + result = self.iloc[:, -periods:] + for col in range(min(ncols, abs(periods))): + # Define filler inside loop so we get a copy + filler = self.iloc[:, -1].shift(len(self)) + result.insert( + len(result.columns), label, filler, allow_duplicates=True + ) - result = cast(DataFrame, result) result.columns = self.columns.copy() return result @@ -6181,7 +6174,10 @@ def f(vals) -> tuple[np.ndarray, int]: return labels.astype("i8", copy=False), len(shape) if subset is None: - subset = self.columns + # Incompatible types in assignment + # (expression has type "Index", variable has type "Sequence[Any]") + # (pending on https://github.com/pandas-dev/pandas/issues/28770) + subset = self.columns # type: ignore[assignment] elif ( not np.iterable(subset) or isinstance(subset, str) @@ -6739,23 +6735,16 @@ def nsmallest(self, n, columns, keep: str = "first") -> DataFrame: self, n=n, keep=keep, columns=columns ).nsmallest() - def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: - """ - Swap levels i and j in a MultiIndex on a particular axis. - - Parameters - ---------- - i, j : int or str - Levels of the indices to be swapped. Can pass level name as string. - axis : {0 or 'index', 1 or 'columns'}, default 0 + @doc( + Series.swaplevel, + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """axis : {0 or 'index', 1 or 'columns'}, default 0 The axis to swap levels on. 0 or 'index' for row-wise, 1 or - 'columns' for column-wise. - - Returns - ------- - DataFrame - - Examples + 'columns' for column-wise.""" + ), + examples=dedent( + """Examples -------- >>> df = pd.DataFrame( ... 
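
The rewritten `DataFrame.shift(axis=1)` above builds the filler columns by shifting an existing column past its own length, so the filler carries a dtype compatible with the frame (GH#38434) instead of forcing an upcast. A small usage check of the expected behaviour described in the hunk's comments (exact dtypes depend on the frame):

```python
import pandas as pd

df = pd.DataFrame({
    "a": pd.to_datetime(["2021-01-01", "2021-01-02"]),
    "b": pd.to_datetime(["2021-03-01", "2021-03-02"]),
})
shifted = df.shift(periods=1, axis=1)
# column "a" becomes all-NaT filler and "b" receives the old "a";
# both columns should stay datetime64[ns] rather than become object.
print(shifted.dtypes)
```
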
{"Grade": ["A", "B", "A", "C"]}, @@ -6804,8 +6793,10 @@ def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: History Final exam January A Geography Final exam February B History Coursework March A - Geography Coursework April C - """ + Geography Coursework April C""" + ), + ) + def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame: result = self.copy() axis = self._get_axis_number(axis) @@ -8144,7 +8135,11 @@ def stack(self, level: Level = -1, dropna: bool = True): return result.__finalize__(self, method="stack") - def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: + def explode( + self, + column: str | tuple | list[str | tuple], + ignore_index: bool = False, + ) -> DataFrame: """ Transform each element of a list-like to a row, replicating index values. @@ -8152,8 +8147,15 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: Parameters ---------- - column : str or tuple - Column to explode. + column : str or tuple or list thereof + Column(s) to explode. + For multiple columns, specify a non-empty list with each element + be str or tuple, and all specified columns their list-like data + on same row of the frame must have matching length. + + .. versionadded:: 1.3.0 + Multi-column explode + ignore_index : bool, default False If True, the resulting index will be labeled 0, 1, …, n - 1. @@ -8168,7 +8170,10 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: Raises ------ ValueError : - if columns of the frame are not unique. + * If columns of the frame are not unique. + * If specified columns to explode is empty list. + * If specified columns to explode have not matching count of + elements rowwise in the frame. See Also -------- @@ -8187,32 +8192,69 @@ def explode(self, column: str | tuple, ignore_index: bool = False) -> DataFrame: Examples -------- - >>> df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1}) + >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]], + ... 'B': 1, + ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]}) >>> df - A B - 0 [1, 2, 3] 1 - 1 foo 1 - 2 [] 1 - 3 [3, 4] 1 + A B C + 0 [0, 1, 2] 1 [a, b, c] + 1 foo 1 NaN + 2 [] 1 [] + 3 [3, 4] 1 [d, e] + + Single-column explode. >>> df.explode('A') - A B - 0 1 1 - 0 2 1 - 0 3 1 - 1 foo 1 - 2 NaN 1 - 3 3 1 - 3 4 1 - """ - if not (is_scalar(column) or isinstance(column, tuple)): - raise ValueError("column must be a scalar") + A B C + 0 0 1 [a, b, c] + 0 1 1 [a, b, c] + 0 2 1 [a, b, c] + 1 foo 1 NaN + 2 NaN 1 [] + 3 3 1 [d, e] + 3 4 1 [d, e] + + Multi-column explode. 
+ + >>> df.explode(list('AC')) + A B C + 0 0 1 a + 0 1 1 b + 0 2 1 c + 1 foo 1 NaN + 2 NaN 1 NaN + 3 3 1 d + 3 4 1 e + """ if not self.columns.is_unique: raise ValueError("columns must be unique") + columns: list[str | tuple] + if is_scalar(column) or isinstance(column, tuple): + assert isinstance(column, (str, tuple)) + columns = [column] + elif isinstance(column, list) and all( + map(lambda c: is_scalar(c) or isinstance(c, tuple), column) + ): + if not column: + raise ValueError("column must be nonempty") + if len(column) > len(set(column)): + raise ValueError("column must be unique") + columns = column + else: + raise ValueError("column must be a scalar, tuple, or list thereof") + df = self.reset_index(drop=True) - result = df[column].explode() - result = df.drop([column], axis=1).join(result) + if len(columns) == 1: + result = df[columns[0]].explode() + else: + mylen = lambda x: len(x) if is_list_like(x) else -1 + counts0 = self[columns[0]].apply(mylen) + for c in columns[1:]: + if not all(counts0 == self[c].apply(mylen)): + raise ValueError("columns must have matching element counts") + result = DataFrame({c: df[c].explode() for c in columns}) + result = df.drop(columns, axis=1).join(result) if ignore_index: result.index = ibase.default_index(len(result)) else: @@ -9303,7 +9345,8 @@ def corr( regardless of the callable's behavior. min_periods : int, optional Minimum number of observations required per pair of columns - to have a valid result. + to have a valid result. Currently only available for Pearson + and Spearman correlation. Returns ------- @@ -9337,9 +9380,7 @@ def corr( correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": correl = libalgos.nancorr_spearman(mat, minp=min_periods) - elif method == "kendall": - correl = libalgos.nancorr_kendall(mat, minp=min_periods) - elif callable(method): + elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 mat = mat.T diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5bd845534fc96..b92cf9751e4cf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -96,7 +96,10 @@ ABCDataFrame, ABCSeries, ) -from pandas.core.dtypes.inference import is_hashable +from pandas.core.dtypes.inference import ( + is_hashable, + is_nested_list_like, +) from pandas.core.dtypes.missing import ( isna, notna, @@ -1873,11 +1876,10 @@ def _drop_labels_or_levels(self, keys, axis: int = 0): # ---------------------------------------------------------------------- # Iteration - def __hash__(self) -> int: - raise TypeError( - f"{repr(type(self).__name__)} objects are mutable, " - f"thus they cannot be hashed" - ) + # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 + # Incompatible types in assignment (expression has type "None", base class + # "object" defined the type as "Callable[[object], int]") + __hash__: None # type: ignore[assignment] def __iter__(self): """ @@ -4185,6 +4187,7 @@ def _drop_axis( # Case for non-unique axis else: + is_tuple_labels = is_nested_list_like(labels) or isinstance(labels, tuple) labels = ensure_object(com.index_labels_to_array(labels)) if level is not None: if not isinstance(axis, MultiIndex): @@ -4194,9 +4197,14 @@ def _drop_axis( # GH 18561 MultiIndex.drop should raise if label is absent if errors == "raise" and indexer.all(): raise KeyError(f"{labels} not found in axis") - elif isinstance(axis, MultiIndex) and labels.dtype == "object": + elif ( + isinstance(axis, MultiIndex) + and labels.dtype == "object" + and not 
is_tuple_labels + ): # Set level to zero in case of MultiIndex and label is string, # because isin can't handle strings for MultiIndexes GH#36293 + # In case of tuples we get dtype object but have to use isin GH#42771 indexer = ~axis.get_level_values(0).isin(labels) else: indexer = ~axis.isin(labels) @@ -5143,11 +5151,12 @@ def tail(self: FrameOrSeries, n: int = 5) -> FrameOrSeries: def sample( self: FrameOrSeries, n=None, - frac=None, - replace=False, + frac: float | None = None, + replace: bool_t = False, weights=None, random_state=None, - axis=None, + axis: Axis | None = None, + ignore_index: bool_t = False, ) -> FrameOrSeries: """ Return a random sample of items from an axis of object. @@ -5189,6 +5198,10 @@ def sample( axis : {0 or ‘index’, 1 or ‘columns’, None}, default None Axis to sample. Accepts axis number or name. Default is stat axis for given data type (0 for Series and DataFrames). + ignore_index : bool, default False + If True, the resulting index will be labeled 0, 1, …, n - 1. + + .. versionadded:: 1.3.0 Returns ------- @@ -5350,7 +5363,11 @@ def sample( ) locs = rs.choice(axis_length, size=n, replace=replace, p=weights) - return self.take(locs, axis=axis) + result = self.take(locs, axis=axis) + if ignore_index: + result.index = ibase.default_index(len(result)) + + return result @final @doc(klass=_shared_doc_kwargs["klass"]) @@ -9751,11 +9768,9 @@ def abs(self: FrameOrSeries) -> FrameOrSeries: 2 6 30 -30 3 7 40 -50 """ - # error: Argument 1 to "__call__" of "ufunc" has incompatible type - # "FrameOrSeries"; expected "Union[Union[int, float, complex, str, bytes, - # generic], Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - return np.abs(self) # type: ignore[arg-type] + # error: Incompatible return value type (got "ndarray[Any, dtype[Any]]", + # expected "FrameOrSeries") + return np.abs(self) # type: ignore[return-value] @final def describe( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 69f992f840c7c..97e8832a8de43 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -759,7 +759,7 @@ def apply_series_value_counts(): # new values are where sorted labels change lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1)) inc = np.r_[True, lchanges] - if not len(lchanges): + if not len(val): inc = lchanges inc[idx] = True # group boundaries are also new values out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts @@ -1020,13 +1020,15 @@ def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs) if isinstance(sobj, Series): # GH#35246 test_groupby_as_index_select_column_sum_empty_df - result.columns = [sobj.name] + result.columns = self._obj_with_exclusions.columns.copy() else: + # Retain our column names + result.columns._set_names( + sobj.columns.names, level=list(range(sobj.columns.nlevels)) + ) # select everything except for the last level, which is the one # containing the name of the function(s), see GH#32040 - result.columns = result.columns.rename( - [sobj.columns.name] * result.columns.nlevels - ).droplevel(-1) + result.columns = result.columns.droplevel(-1) if not self.as_index: self._insert_inaxis_grouper_inplace(result) @@ -1665,7 +1667,7 @@ def _wrap_transformed_output( result.columns = self.obj.columns else: columns = Index(key.label for key in output) - columns.name = self.obj.columns.name + columns._set_names(self.obj._get_axis(1 - self.axis).names) result.columns = columns result.index = self.obj.index 
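# Illustrative sketch only (not part of the patch): the ``ignore_index``
# option added to ``sample`` in pandas/core/generic.py above resets the
# result's index to a fresh RangeIndex. The frame below is a hypothetical
# example, assuming the usual ``import pandas as pd``.
>>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=[10, 20, 30, 40])
>>> df.sample(n=2, random_state=0, ignore_index=True).index
RangeIndex(start=0, stop=2, step=1)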
@@ -1674,7 +1676,9 @@ def _wrap_transformed_output( def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame: if not self.as_index: - index = Index(range(mgr.shape[1])) + # GH 41998 - empty mgr always gets index of length 0 + rows = mgr.shape[1] if mgr.shape[0] > 0 else 0 + index = Index(range(rows)) mgr.set_axis(1, index) result = self.obj._constructor(mgr) @@ -1800,7 +1804,6 @@ def nunique(self, dropna: bool = True) -> DataFrame: results = self._apply_to_column_groupbys( lambda sgb: sgb.nunique(dropna), obj=obj ) - results.columns.names = obj.columns.names # TODO: do at higher level? if not self.as_index: results.index = Index(range(len(results))) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index f694dcce809ea..ede5f5298b129 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -63,6 +63,7 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -1012,7 +1013,11 @@ def reset_identity(values): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self.filter(lambda x: True).axes[self.axis] + ax = ( + self.filter(lambda x: True).axes[self.axis] + if self.dropna + else self._selected_obj._get_axis(self.axis) + ) # this is a very unfortunate situation # we can't use reindex to restore the original order @@ -1122,6 +1127,14 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: if self.obj.ndim == 2: # i.e. DataFrameGroupBy numeric_only = True + # GH#42395 GH#43108 GH#43154 + # Regression from 1.2.5 to 1.3 caused object columns to be dropped + obj = self._obj_with_exclusions + check = obj._get_numeric_data() + if len(obj.columns) and not len(check.columns) and not obj.empty: + numeric_only = False + # TODO: v1.4+ Add FutureWarning + else: numeric_only = False @@ -1143,9 +1156,15 @@ def _numba_prep(self, func, data): sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False) sorted_data = data.take(sorted_index, axis=self.axis).to_numpy() + sorted_index_data = data.index.take(sorted_index).to_numpy() starts, ends = lib.generate_slices(sorted_ids, ngroups) - return starts, ends, sorted_index, sorted_data + return ( + starts, + ends, + sorted_index_data, + sorted_data, + ) @final def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs): @@ -1519,7 +1538,11 @@ def _bool_agg(self, val_test, skipna): def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]: if is_object_dtype(vals): - vals = np.array([bool(x) for x in vals]) + # GH#37501: don't raise on pd.NA when skipna=True + if skipna: + vals = np.array([bool(x) if not isna(x) else True for x in vals]) + else: + vals = np.array([bool(x) for x in vals]) elif isinstance(vals, BaseMaskedArray): vals = vals._data.astype(bool, copy=False) else: @@ -2428,6 +2451,9 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: elif is_timedelta64_dtype(vals.dtype): inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): + inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index c5d5d5a301336..cfe448fd52db2 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -833,9 +833,11 @@ def is_in_obj(gpr) -> bool: 
return False try: return gpr is obj[gpr.name] - except (KeyError, IndexError): + except (KeyError, IndexError, InvalidIndexError): # IndexError reached in e.g. test_skip_group_keys when we pass # lambda here + # InvalidIndexError raised on key-types inappropriate for index, + # e.g. DatetimeIndex.get_loc(tuple()) return False for gpr, level in zip(keys, levels): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index b65f26c7174fc..874d7395b1950 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -161,7 +161,9 @@ def _get_cython_function( f = getattr(libgroupby, ftype) if is_numeric: return f - elif dtype == object: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Literal['object']") + elif dtype == object: # type: ignore[comparison-overlap] if "object" not in f.__signatures__: # raise NotImplementedError here rather than TypeError later raise NotImplementedError( diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index eaba30012a5b8..4c5fd526523e6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -559,7 +559,9 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): return Int64Index - elif dtype == object: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[object]") + elif dtype == object: # type: ignore[comparison-overlap] # NB: assuming away MultiIndex return Index @@ -3091,15 +3093,19 @@ def intersection(self, other, sort=False): return this.intersection(other, sort=sort) result = self._intersection(other, sort=sort) - return self._wrap_setop_result(other, result) + return self._wrap_intersection_result(other, result) def _intersection(self, other: Index, sort=False): """ intersection specialized to the case with matching dtypes. """ - # TODO(EA): setops-refactor, clean all this up - - if self.is_monotonic and other.is_monotonic: + if ( + self.is_monotonic + and other.is_monotonic + and not is_interval_dtype(self.dtype) + ): + # For IntervalIndex _inner_indexer is not more performant than get_indexer, + # so don't take this fastpath try: result = self._inner_indexer(other)[0] except TypeError: @@ -3113,6 +3119,10 @@ def _intersection(self, other: Index, sort=False): res_values = _maybe_try_sort(res_values, sort) return res_values + def _wrap_intersection_result(self, other, result): + # We will override for MultiIndex to handle empty results + return self._wrap_setop_result(other, result) + def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike: """ Find the intersection of two Indexes using get_indexer. @@ -3122,10 +3132,8 @@ def _intersection_via_get_indexer(self, other: Index, sort) -> ArrayLike: np.ndarray or ExtensionArray The returned array will be unique. 
""" - # Note: drop_duplicates vs unique matters for MultiIndex, though - # it should not, see GH#41823 - left_unique = self.drop_duplicates() - right_unique = other.drop_duplicates() + left_unique = self.unique() + right_unique = other.unique() # even though we are unique, we need get_indexer_for for IntervalIndex indexer = left_unique.get_indexer_for(right_unique) @@ -3246,12 +3254,47 @@ def symmetric_difference(self, other, result_name=None, sort=None): if result_name is None: result_name = result_name_update - left = self.difference(other, sort=False) - right = other.difference(self, sort=False) - result = left.union(right, sort=sort) + if not self._should_compare(other): + return self.union(other, sort=sort).rename(result_name) + + elif not is_dtype_equal(self.dtype, other.dtype): + dtype = self._find_common_type_compat(other) + this = self.astype(dtype, copy=False) + that = other.astype(dtype, copy=False) + return this.symmetric_difference(that, sort=sort).rename(result_name) + + this = self.unique() + other = other.unique() + indexer = this.get_indexer_for(other) + + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d( + np.arange(this.size), common_indexer, assume_unique=True + ) + left_diff = this._values.take(left_indexer) + + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other._values.take(right_indexer) + + res_values = concat_compat([left_diff, right_diff]) + res_values = _maybe_try_sort(res_values, sort) + + result = Index(res_values, name=result_name) + + if self._is_multi: + self = cast("MultiIndex", self) + if len(result) == 0: + # On equal symmetric_difference MultiIndexes the difference is empty. + # Therefore, an empty MultiIndex is returned GH#13490 + return type(self)( + levels=[[] for _ in range(self.nlevels)], + codes=[[] for _ in range(self.nlevels)], + names=result.name, + ) + return type(self).from_tuples(result, names=result.name) - if result_name is not None: - result = result.rename(result_name) return result @final @@ -3403,6 +3446,37 @@ def get_indexer( # matched to Interval scalars return self._get_indexer_non_comparable(target, method=method, unique=True) + if is_categorical_dtype(self.dtype): + # _maybe_cast_listlike_indexer ensures target has our dtype + # (could improve perf by doing _should_compare check earlier?) + assert is_dtype_equal(self.dtype, target.dtype) + + indexer = self._engine.get_indexer(target.codes) + if self.hasnans and target.hasnans: + loc = self.get_loc(np.nan) + mask = target.isna() + indexer[mask] = loc + return indexer + + if is_categorical_dtype(target.dtype): + # potential fastpath + # get an indexer for unique categories then propagate to codes via take_nd + # get_indexer instead of _get_indexer needed for MultiIndex cases + # e.g. 
test_append_different_columns_types + categories_indexer = self.get_indexer(target.categories) + + indexer = algos.take_nd(categories_indexer, target.codes, fill_value=-1) + + if (not self._is_multi and self.hasnans) and target.hasnans: + # Exclude MultiIndex because hasnans raises NotImplementedError + # we should only get here if we are unique, so loc is an integer + # GH#41934 + loc = self.get_loc(np.nan) + mask = target.isna() + indexer[mask] = loc + + return ensure_platform_int(indexer) + pself, ptarget = self._maybe_promote(target) if pself is not self or ptarget is not target: return pself.get_indexer( @@ -4501,9 +4575,10 @@ def __contains__(self, key: Any) -> bool: except (OverflowError, TypeError, ValueError): return False - @final - def __hash__(self): - raise TypeError(f"unhashable type: {repr(type(self).__name__)}") + # https://github.com/python/typeshed/issues/2148#issuecomment-520783318 + # Incompatible types in assignment (expression has type "None", base class + # "object" defined the type as "Callable[[object], int]") + __hash__: None # type: ignore[assignment] @final def __setitem__(self, key, value): @@ -4648,8 +4723,7 @@ def putmask(self, mask, value) -> Index: values, mask.sum(), converted # type: ignore[arg-type] ) np.putmask(values, mask, converted) - - return type(self)._simple_new(values, name=self.name) + return self._shallow_copy(values) def equals(self, other: Any) -> bool: """ @@ -5303,6 +5377,13 @@ def _maybe_promote(self, other: Index) -> tuple[Index, Index]: if not is_object_dtype(self.dtype): return self.astype("object"), other.astype("object") + elif self.dtype.kind == "u" and other.dtype.kind == "i": + # GH#41873 + if other.min() >= 0: + # lookup min as it may be cached + # TODO: may need itemsize check if we have non-64-bit Indexes + return self, other.astype(self.dtype) + if not is_object_dtype(self.dtype) and is_object_dtype(other.dtype): # Reverse op so we dont need to re-implement on the subclasses other, self = other._maybe_promote(self) diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index 228f58d47b8ed..b13ae68f5b22d 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -193,17 +193,12 @@ def _can_hold_strings(self): def _engine_type(self): # self.codes can have dtype int8, int16, int32 or int64, so we need # to return the corresponding engine type (libindex.Int8Engine, etc.). 
- - # error: Invalid index type "Type[generic]" for "Dict[Type[signedinteger[Any]], - # Any]"; expected type "Type[signedinteger[Any]]" return { np.int8: libindex.Int8Engine, np.int16: libindex.Int16Engine, np.int32: libindex.Int32Engine, np.int64: libindex.Int64Engine, - }[ - self.codes.dtype.type # type: ignore[index] - ] + }[self.codes.dtype.type] _attributes = ["name"] @@ -483,7 +478,29 @@ def _reindex_non_unique( # type: ignore[override] # Indexing Methods def _maybe_cast_indexer(self, key) -> int: - return self._data._unbox_scalar(key) + # GH#41933: we have to do this instead of self._data._validate_scalar + # because this will correctly get partial-indexing on Interval categories + try: + return self._data._unbox_scalar(key) + except KeyError: + if is_valid_na_for_dtype(key, self.categories.dtype): + return -1 + raise + + def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex: + if isinstance(values, CategoricalIndex): + values = values._data + if isinstance(values, Categorical): + # Indexing on codes is more efficient if categories are the same, + # so we can apply some optimizations based on the degree of + # dtype-matching. + cat = self._data._encode_with_my_categories(values) + codes = cat._codes + else: + codes = self.categories.get_indexer(values) + codes = codes.astype(self.codes.dtype, copy=False) + cat = self._data._from_backing_data(codes) + return type(self)._simple_new(cat) def _get_indexer( self, diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index df7fae0763c42..2780ca08e9f89 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -26,7 +26,10 @@ Resolution, Tick, ) -from pandas._typing import Callable +from pandas._typing import ( + Callable, + final, +) from pandas.compat.numpy import function as nv from pandas.util._decorators import ( Appender, @@ -60,6 +63,7 @@ inherit_names, make_wrapped_arith_op, ) +from pandas.core.indexes.numeric import Int64Index from pandas.core.tools.timedeltas import to_timedelta if TYPE_CHECKING: @@ -399,6 +403,7 @@ def _validate_partial_date_slice(self, reso: Resolution): def _parsed_string_to_bounds(self, reso: Resolution, parsed: datetime): raise NotImplementedError + @final def _partial_date_slice( self, reso: Resolution, @@ -621,7 +626,7 @@ def _with_freq(self, freq): @property def _has_complex_internals(self) -> bool: # used to avoid libreduction code paths, which raise or require conversion - return False + return True def is_type_compatible(self, kind: str) -> bool: return kind in self._data._infer_matches @@ -683,7 +688,8 @@ def _can_fast_intersect(self: _T, other: _T) -> bool: elif self.freq.is_anchored(): # this along with matching freqs ensure that we "line up", # so intersection will preserve freq - return True + # GH#42104 + return self.freq.n == 1 elif isinstance(self.freq, Tick): # We "line up" if and only if the difference between two of our points @@ -692,7 +698,8 @@ def _can_fast_intersect(self: _T, other: _T) -> bool: remainder = diff % self.freq.delta return remainder == Timedelta(0) - return True + # GH#42104 + return self.freq.n == 1 def _can_fast_union(self: _T, other: _T) -> bool: # Assumes that type(self) == type(other), as per the annotation @@ -778,7 +785,11 @@ def _union(self, other, sort): # that result.freq == self.freq return result else: - return super()._union(other, sort=sort)._with_freq("infer") + i8self = Int64Index._simple_new(self.asi8) + i8other = Int64Index._simple_new(other.asi8) + i8result = 
i8self._union(i8other, sort=sort) + result = type(self)(i8result, dtype=self.dtype, freq="infer") + return result # -------------------------------------------------------------------- # Join Methods diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index fbfee9a1f524c..ff9a273267306 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -92,7 +92,8 @@ def _new_DatetimeIndex(cls, d): # These are already stored in our DatetimeArray; if they are # also in the pickle and don't match, we have a problem. if key in d: - assert d.pop(key) == getattr(dta, key) + assert d[key] == getattr(dta, key) + d.pop(key) result = cls._simple_new(dta, **d) else: with warnings.catch_warnings(): diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 6ff20f7d009bc..ccc1884dd7495 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -397,11 +397,13 @@ def astype(self, dtype, copy: bool = True) -> Index: return self return self.copy() + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Literal['M8[ns]']") if ( isinstance(self.dtype, np.dtype) and isinstance(dtype, np.dtype) and dtype.kind == "M" - and dtype != "M8[ns]" + and dtype != "M8[ns]" # type: ignore[comparison-overlap] ): # For now Datetime supports this by unwrapping ndarray, but DTI doesn't raise TypeError(f"Cannot cast {type(self).__name__} to dtype") diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 072ab7dff8e5b..94d7814151a25 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -7,10 +7,8 @@ ) import textwrap from typing import ( - TYPE_CHECKING, Any, Hashable, - cast, ) import numpy as np @@ -39,6 +37,7 @@ from pandas.util._exceptions import rewrite_exception from pandas.core.dtypes.cast import ( + construct_1d_object_array_from_listlike, find_common_type, infer_dtype_from_scalar, maybe_box_datetimelike, @@ -46,7 +45,6 @@ ) from pandas.core.dtypes.common import ( ensure_platform_int, - is_categorical_dtype, is_datetime64tz_dtype, is_datetime_or_timedelta_dtype, is_dtype_equal, @@ -63,7 +61,7 @@ from pandas.core.dtypes.dtypes import IntervalDtype from pandas.core.dtypes.missing import is_valid_na_for_dtype -from pandas.core.algorithms import take_nd +from pandas.core.algorithms import unique from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -91,9 +89,6 @@ timedelta_range, ) -if TYPE_CHECKING: - from pandas import CategoricalIndex - _index_doc_kwargs = dict(ibase._index_doc_kwargs) _index_doc_kwargs.update( @@ -668,11 +663,7 @@ def _get_indexer( left_indexer = self.left.get_indexer(target.left) right_indexer = self.right.get_indexer(target.right) indexer = np.where(left_indexer == right_indexer, left_indexer, -1) - elif is_categorical_dtype(target.dtype): - target = cast("CategoricalIndex", target) - # get an indexer for unique categories then propagate to codes via take_nd - categories_indexer = self.get_indexer(target.categories) - indexer = take_nd(categories_indexer, target.codes, fill_value=-1) + elif not is_object_dtype(target): # homogeneous scalar index: use IntervalTree target = self._maybe_convert_i8(target) @@ -802,6 +793,80 @@ def _format_data(self, name=None) -> str: # name argument is unused here; just for compat with base / categorical return self._data._format_data() + "," + self._format_space() + # 
-------------------------------------------------------------------- + # Set Operations + + def _intersection(self, other, sort): + """ + intersection specialized to the case with matching dtypes. + """ + # For IntervalIndex we also know other.closed == self.closed + if self.left.is_unique and self.right.is_unique: + taken = self._intersection_unique(other) + elif other.left.is_unique and other.right.is_unique and self.isna().sum() <= 1: + # Swap other/self if other is unique and self does not have + # multiple NaNs + taken = other._intersection_unique(self) + else: + # duplicates + taken = self._intersection_non_unique(other) + + if sort is None: + taken = taken.sort_values() + + return taken + + def _intersection_unique(self, other: IntervalIndex) -> IntervalIndex: + """ + Used when the IntervalIndex does not have any common endpoint, + no matter left or right. + Return the intersection with another IntervalIndex. + Parameters + ---------- + other : IntervalIndex + Returns + ------- + IntervalIndex + """ + # Note: this is much more performant than super()._intersection(other) + lindexer = self.left.get_indexer(other.left) + rindexer = self.right.get_indexer(other.right) + + match = (lindexer == rindexer) & (lindexer != -1) + indexer = lindexer.take(match.nonzero()[0]) + indexer = unique(indexer) + + return self.take(indexer) + + def _intersection_non_unique(self, other: IntervalIndex) -> IntervalIndex: + """ + Used when the IntervalIndex does have some common endpoints, + on either sides. + Return the intersection with another IntervalIndex. + + Parameters + ---------- + other : IntervalIndex + + Returns + ------- + IntervalIndex + """ + # Note: this is about 3.25x faster than super()._intersection(other) + # in IntervalIndexMethod.time_intersection_both_duplicate(1000) + mask = np.zeros(len(self), dtype=bool) + + if self.hasnans and other.hasnans: + first_nan_loc = np.arange(len(self))[self.isna()][0] + mask[first_nan_loc] = True + + other_tups = set(zip(other.left, other.right)) + for i, tup in enumerate(zip(self.left, self.right)): + if tup in other_tups: + mask[i] = True + + return self[mask] + # -------------------------------------------------------------------- @property @@ -812,6 +877,19 @@ def _is_all_dates(self) -> bool: """ return False + def _get_join_target(self) -> np.ndarray: + # constructing tuples is much faster than constructing Intervals + tups = list(zip(self.left, self.right)) + target = construct_1d_object_array_from_listlike(tups) + return target + + def _from_join_target(self, result): + left, right = list(zip(*result)) + arr = type(self._data).from_arrays( + left, right, dtype=self.dtype, closed=self.closed + ) + return type(self)._simple_new(arr, name=self.name) + # TODO: arithmetic operations diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 4dff63ea22e00..669969afd05a0 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3588,27 +3588,10 @@ def _maybe_match_names(self, other): names.append(None) return names - def _intersection(self, other, sort=False) -> MultiIndex: + def _wrap_intersection_result(self, other, result): other, result_names = self._convert_can_do_setop(other) - other = other.astype(object, copy=False) - uniq_tuples = None # flag whether _inner_indexer was successful - if self.is_monotonic and other.is_monotonic: - try: - inner_tuples = self._inner_indexer(other)[0] - sort = False # inner_tuples is already sorted - except TypeError: - pass - else: - uniq_tuples = algos.unique(inner_tuples) 
- - if uniq_tuples is None: - uniq_tuples = self._intersection_via_get_indexer(other, sort) - - if sort is None: - uniq_tuples = sorted(uniq_tuples) - - if len(uniq_tuples) == 0: + if len(result) == 0: return MultiIndex( levels=self.levels, codes=[[]] * self.nlevels, @@ -3616,22 +3599,12 @@ def _intersection(self, other, sort=False) -> MultiIndex: verify_integrity=False, ) else: - return MultiIndex.from_arrays( - zip(*uniq_tuples), sortorder=0, names=result_names - ) + return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) def _difference(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) - this = self._get_unique_index() - - indexer = this.get_indexer(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) - - label_diff = np.setdiff1d(np.arange(this.size), indexer, assume_unique=True) - difference = this._values.take(label_diff) - if sort is None: - difference = sorted(difference) + difference = super()._difference(other, sort) if len(difference) == 0: return MultiIndex( @@ -3663,18 +3636,6 @@ def _convert_can_do_setop(self, other): return other, result_names - def symmetric_difference(self, other, result_name=None, sort=None): - # On equal symmetric_difference MultiIndexes the difference is empty. - # Therefore, an empty MultiIndex is returned GH13490 - tups = Index.symmetric_difference(self, other, result_name, sort) - if len(tups) == 0: - return type(self)( - levels=[[] for _ in range(self.nlevels)], - codes=[[] for _ in range(self.nlevels)], - names=tups.names, - ) - return tups - # -------------------------------------------------------------------- @doc(Index.astype) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index ea2d5d9eec6ac..ce93cdff09ae0 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -370,6 +370,16 @@ class UInt64Index(IntegerIndex): _default_dtype = np.dtype(np.uint64) _dtype_validation_metadata = (is_unsigned_integer_dtype, "unsigned integer") + def _validate_fill_value(self, value): + # e.g. 
np.array([1]) we want np.array([1], dtype=np.uint64) + # see test_where_uin64 + super()._validate_fill_value(value) + if hasattr(value, "dtype") and is_signed_integer_dtype(value.dtype): + if (value >= 0).all(): + return value.astype(self.dtype) + raise TypeError + return value + class Float64Index(NumericIndex): _index_descr_args = { diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index c1104b80a0a7a..7917e0eff0227 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -15,6 +15,7 @@ ) from pandas._libs.tslibs import ( BaseOffset, + NaT, Period, Resolution, Tick, @@ -32,12 +33,12 @@ from pandas.core.dtypes.common import ( is_datetime64_any_dtype, - is_float, is_integer, is_scalar, pandas_dtype, ) from pandas.core.dtypes.dtypes import PeriodDtype +from pandas.core.dtypes.missing import is_valid_na_for_dtype from pandas.core.arrays.period import ( PeriodArray, @@ -414,7 +415,10 @@ def get_loc(self, key, method=None, tolerance=None): if not is_scalar(key): raise InvalidIndexError(key) - if isinstance(key, str): + if is_valid_na_for_dtype(key, self.dtype): + key = NaT + + elif isinstance(key, str): try: loc = self._get_string_slice(key) @@ -448,10 +452,25 @@ def get_loc(self, key, method=None, tolerance=None): else: key = asdt - elif is_integer(key): - # Period constructor will cast to string, which we dont want - raise KeyError(key) - elif isinstance(key, Period) and key.freq != self.freq: + elif isinstance(key, Period): + sfreq = self.freq + kfreq = key.freq + if not ( + sfreq.n == kfreq.n + and sfreq._period_dtype_code == kfreq._period_dtype_code + ): + # GH#42247 For the subset of DateOffsets that can be Period freqs, + # checking these two attributes is sufficient to check equality, + # and much more performant than `self.freq == key.freq` + raise KeyError(key) + elif isinstance(key, datetime): + try: + key = Period(key, freq=self.freq) + except ValueError as err: + # we cannot construct the Period + raise KeyError(orig_key) from err + else: + # in particular integer, which Period constructor would cast to string raise KeyError(key) try: @@ -493,13 +512,14 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): elif isinstance(label, str): try: parsed, reso_str = parse_time_string(label, self.freq) - reso = Resolution.from_attrname(reso_str) - bounds = self._parsed_string_to_bounds(reso, parsed) - return bounds[0 if side == "left" else 1] except ValueError as err: # string cannot be parsed as datetime-like raise self._invalid_indexer("slice", label) from err - elif is_integer(label) or is_float(label): + + reso = Resolution.from_attrname(reso_str) + lower, upper = self._parsed_string_to_bounds(reso, parsed) + return lower if side == "left" else upper + elif not isinstance(label, self._data._recognized_scalars): raise self._invalid_indexer("slice", label) return label diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 746246172b967..8588f55f64389 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -730,6 +730,18 @@ def _difference(self, other, sort=None): new_index = new_index[::-1] return new_index + def symmetric_difference(self, other, result_name: Hashable = None, sort=None): + if not isinstance(other, RangeIndex) or sort is not None: + return super().symmetric_difference(other, result_name, sort) + + left = self.difference(other) + right = other.difference(self) + result = left.union(right) + + if result_name is not None: + result = 
result.rename(result_name) + return result + # -------------------------------------------------------------------- def _concat(self, indexes: list[Index], name: Hashable) -> Index: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index c60ab06dd08f3..4d77f5ffc98e1 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -198,12 +198,16 @@ def _maybe_cast_slice_bound(self, label, side: str, kind=lib.no_default): self._deprecated_arg(kind, "kind", "_maybe_cast_slice_bound") if isinstance(label, str): - parsed = Timedelta(label) - lbound = parsed.round(parsed.resolution_string) - if side == "left": - return lbound - else: - return lbound + to_offset(parsed.resolution_string) - Timedelta(1, "ns") + try: + parsed = Timedelta(label) + except ValueError as err: + # e.g. 'unit abbreviation w/o a number' + raise self._invalid_indexer("slice", label) from err + + # The next two lines are analogous to DTI/PI._parsed_str_to_bounds + lower = parsed.round(parsed.resolution_string) + upper = lower + to_offset(parsed.resolution_string) - Timedelta(1, "ns") + return lower if side == "left" else upper elif not isinstance(label, self._data._recognized_scalars): raise self._invalid_indexer("slice", label) diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 3707e141bc447..d5a5baddfb197 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -42,8 +42,12 @@ isna, ) +from pandas.core import algorithms as algos import pandas.core.common as com -from pandas.core.construction import array as pd_array +from pandas.core.construction import ( + array as pd_array, + extract_array, +) from pandas.core.indexers import ( check_array_indexer, is_empty_indexer, @@ -916,8 +920,7 @@ def __getitem__(self, key): key = tuple(list(x) if is_iterator(x) else x for x in key) key = tuple(com.apply_if_callable(x, self.obj) for x in key) if self._is_scalar_access(key): - with suppress(KeyError, IndexError, AttributeError): - # AttributeError for IntervalTree get_value + with suppress(KeyError, IndexError): return self.obj._get_value(*key, takeable=self._takeable) return self._getitem_tuple(key) else: @@ -1004,7 +1007,7 @@ def _is_scalar_access(self, key: tuple) -> bool: # should not be considered scalar return False - if not ax.is_unique: + if not ax._index_as_unique: return False return True @@ -1662,6 +1665,21 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): if com.is_null_slice(indexer[0]): # We are setting an entire column self.obj[key] = value + return + elif is_array_like(value): + # GH#42099 + arr = extract_array(value, extract_numpy=True) + taker = -1 * np.ones(len(self.obj), dtype=np.intp) + empty_value = algos.take_nd(arr, taker) + if not isinstance(value, ABCSeries): + # if not Series (in which case we need to align), + # we can short-circuit + empty_value[indexer[0]] = arr + self.obj[key] = empty_value + return + + self.obj[key] = empty_value + else: self.obj[key] = infer_fill_value(value) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 237d06402a0ee..78ebfad4e3ae9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1167,13 +1167,17 @@ def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Blo # convert integer to float if necessary. 
need to do a lot more than # that, handle boolean etc also - # error: Argument 1 to "maybe_upcast" has incompatible type "Union[ndarray, - # ExtensionArray]"; expected "ndarray" + # error: Value of type variable "NumpyArrayT" of "maybe_upcast" cannot be + # "Union[ndarray[Any, Any], ExtensionArray]" new_values, fill_value = maybe_upcast( - self.values, fill_value # type: ignore[arg-type] + self.values, fill_value # type: ignore[type-var] ) - new_values = shift(new_values, periods, axis, fill_value) + # error: Argument 1 to "shift" has incompatible type "Union[ndarray[Any, Any], + # ExtensionArray]"; expected "ndarray[Any, Any]" + new_values = shift( + new_values, periods, axis, fill_value # type: ignore[arg-type] + ) return [self.make_block(new_values)] @@ -1553,6 +1557,16 @@ def _slice(self, slicer): return self.values[slicer] + @final + def getitem_block_index(self, slicer: slice) -> ExtensionBlock: + """ + Perform __getitem__-like specialized to slicing along index. + """ + # GH#42787 in principle this is equivalent to values[..., slicer], but we don't + # require subclasses of ExtensionArray to support that form (for now). + new_values = self.values[slicer] + return type(self)(new_values, self._mgr_locs, ndim=self.ndim) + def fillna( self, value, limit=None, inplace: bool = False, downcast=None ) -> list[Block]: @@ -1894,9 +1908,7 @@ def get_block_type(values, dtype: Dtype | None = None): cls = ExtensionBlock elif isinstance(dtype, CategoricalDtype): cls = CategoricalBlock - # error: Non-overlapping identity check (left operand type: "Type[generic]", - # right operand type: "Type[Timestamp]") - elif vtype is Timestamp: # type: ignore[comparison-overlap] + elif vtype is Timestamp: cls = DatetimeTZBlock elif isinstance(dtype, ExtensionDtype): # Note: need to be sure PandasArray is unwrapped before we get here diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 9642b30ab91ca..34d0137c26fda 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -123,12 +123,16 @@ def concat_arrays(to_concat: list) -> ArrayLike: # ignore the all-NA proxies to determine the resulting dtype to_concat_no_proxy = [x for x in to_concat if not isinstance(x, NullArrayProxy)] - single_dtype = len({x.dtype for x in to_concat_no_proxy}) == 1 + dtypes = {x.dtype for x in to_concat_no_proxy} + single_dtype = len(dtypes) == 1 - if not single_dtype: - target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) - else: + if single_dtype: target_dtype = to_concat_no_proxy[0].dtype + elif all(x.kind in ["i", "u", "b"] and isinstance(x, np.dtype) for x in dtypes): + # GH#42092 + target_dtype = np.find_common_type(list(dtypes), []) + else: + target_dtype = find_common_type([arr.dtype for arr in to_concat_no_proxy]) if target_dtype.kind in ["m", "M"]: # for datetimelike use DatetimeArray/TimedeltaArray concatenation @@ -592,6 +596,9 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: # e.g. 
DatetimeLikeBlock can be dt64 or td64, but these are not uniform all( is_dtype_equal(ju.block.dtype, join_units[0].block.dtype) + # GH#42092 we only want the dtype_equal check for non-numeric blocks + # (for now, may change but that would need a deprecation) + or ju.block.dtype.kind in ["b", "i", "u"] for ju in join_units ) and diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 81bf3ca4ba07a..8987dd5adf5d3 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -10,6 +10,7 @@ Any, Hashable, Sequence, + cast, ) import warnings @@ -25,6 +26,7 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, + dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -58,7 +60,6 @@ TimedeltaArray, ) from pandas.core.construction import ( - create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, range_to_ndarray, @@ -66,7 +67,9 @@ ) from pandas.core.indexes import base as ibase from pandas.core.indexes.api import ( + DatetimeIndex, Index, + TimedeltaIndex, ensure_index, get_objs_combined_axis, union_indexes, @@ -165,6 +168,9 @@ def rec_array_to_mgr( # fill if needed if isinstance(data, np.ma.MaskedArray): + # GH#42200 we only get here with MaskedRecords, but check for the + # parent class MaskedArray to avoid the need to import MaskedRecords + data = cast("MaskedRecords", data) new_arrays = fill_masked_arrays(data, arr_columns) else: # error: Incompatible types in assignment (expression has type @@ -552,6 +558,7 @@ def convert(v): def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: + oindex = None homogenized = [] for val in data: @@ -566,9 +573,18 @@ def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: val = val._values else: if isinstance(val, dict): - # see test_constructor_subclass_dict - # test_constructor_dict_datetime64_index - val = create_series_with_explicit_dtype(val, index=index)._values + # GH#41785 this _should_ be equivalent to (but faster than) + # val = create_series_with_explicit_dtype(val, index=index)._values + if oindex is None: + oindex = index.astype("O") + + if isinstance(index, (DatetimeIndex, TimedeltaIndex)): + # see test_constructor_dict_datetime64_index + val = dict_compat(val) + else: + # see test_constructor_subclass_dict + val = dict(val) + val = lib.fast_multiget(val, oindex._values, default=np.nan) val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False @@ -745,6 +761,14 @@ def to_arrays( # i.e. numpy structured array columns = ensure_index(data.dtype.names) arrays = [data[name] for name in columns] + + if len(data) == 0: + # GH#42456 the indexing above results in list of 2D ndarrays + # TODO: is that an issue with numpy? 
+ for i, arr in enumerate(arrays): + if arr.ndim == 2: + arrays[i] = arr[:, 0] + return arrays, columns return [], ensure_index([]) diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 48f0b7f7f964b..ea22cdaa89299 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -381,6 +381,29 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T: if fill_value is lib.no_default: fill_value = None + if axis == 0 and self.ndim == 2 and self.nblocks > 1: + # GH#35488 we need to watch out for multi-block cases + # We only get here with fill_value not-lib.no_default + ncols = self.shape[0] + if periods > 0: + indexer = np.array( + [-1] * periods + list(range(ncols - periods)), dtype=np.intp + ) + else: + nper = abs(periods) + indexer = np.array( + list(range(nper, ncols)) + [-1] * nper, dtype=np.intp + ) + result = self.reindex_indexer( + self.items, + indexer, + axis=0, + fill_value=fill_value, + allow_dups=True, + consolidate=False, + ) + return result + return self.apply("shift", periods=periods, axis=axis, fill_value=fill_value) def fillna(self: T, value, limit, inplace: bool, downcast) -> T: @@ -575,6 +598,9 @@ def copy_func(ax): res = self.apply("copy", deep=deep) res.axes = new_axes + + if deep: + res._consolidate_inplace() return res def consolidate(self: T) -> T: @@ -1162,8 +1188,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: warnings.warn( "DataFrame is highly fragmented. This is usually the result " "of calling `frame.insert` many times, which has poor performance. " - "Consider using pd.concat instead. To get a de-fragmented frame, " - "use `newframe = frame.copy()`", + "Consider joining all columns at once using pd.concat(axis=1) " + "instead. To get a de-fragmented frame, use `newframe = frame.copy()`", PerformanceWarning, stacklevel=5, ) diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index ecdf2624c8ec1..3b03a28afe163 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -205,7 +205,7 @@ def _get_fill_value( else: if fill_value_typ == "+inf": # need the max int here - return np.iinfo(np.int64).max + return lib.i8max else: return iNaT @@ -376,7 +376,7 @@ def _wrap_results(result, dtype: np.dtype, fill_value=None): result = np.nan # raise if we have a timedelta64[ns] which is too large - if np.fabs(result) > np.iinfo(np.int64).max: + if np.fabs(result) > lib.i8max: raise ValueError("overflow in timedelta operation") result = Timedelta(result, unit="ns") @@ -603,7 +603,9 @@ def _mask_datetimelike_result( # we need to apply the mask result = result.astype("i8").view(orig_values.dtype) axis_mask = mask.any(axis=axis) - result[axis_mask] = iNaT + # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any], + # datetime64, timedelta64]") + result[axis_mask] = iNaT # type: ignore[index] else: if mask.any(): return NaT @@ -755,7 +757,10 @@ def get_median(x): def get_empty_reduction_result( - shape: tuple[int, ...], axis: int, dtype: np.dtype, fill_value: Any + shape: tuple[int, ...], + axis: int, + dtype: np.dtype | type[np.floating], + fill_value: Any, ) -> np.ndarray: """ The result from a reduction on an empty ndarray. 
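# Illustrative sketch only (not part of the patch): the multi-block branch
# added to BlockManager.shift in pandas/core/internals/managers.py above
# (GH#35488) covers shifting a mixed-dtype frame along the columns with an
# explicit fill_value. The frame below is a hypothetical example; column "a"
# is filled with the fill_value and column "b" receives the old values of "a".
>>> df = pd.DataFrame({"a": [1, 2], "b": [1.5, 2.5]})  # one int block, one float block
>>> df.shift(periods=1, axis=1, fill_value=0)["b"].tolist()
[1, 2]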
@@ -809,9 +814,7 @@ def _get_counts_nanvar( """ dtype = get_dtype(dtype) count = _get_counts(values_shape, mask, axis, dtype=dtype) - # error: Unsupported operand types for - ("int" and "generic") - # error: Unsupported operand types for - ("float" and "generic") - d = count - dtype.type(ddof) # type: ignore[operator] + d = count - dtype.type(ddof) # always return NaN, never inf if is_scalar(count): @@ -1400,9 +1403,7 @@ def _get_counts( n = mask.size - mask.sum() else: n = np.prod(values_shape) - # error: Incompatible return value type (got "Union[Any, generic]", - # expected "Union[int, float, ndarray]") - return dtype.type(n) # type: ignore[return-value] + return dtype.type(n) if mask is not None: count = mask.shape[axis] - mask.sum(axis) @@ -1410,9 +1411,7 @@ def _get_counts( count = values_shape[axis] if is_scalar(count): - # error: Incompatible return value type (got "Union[Any, generic]", - # expected "Union[int, float, ndarray]") - return dtype.type(count) # type: ignore[return-value] + return dtype.type(count) try: return count.astype(dtype) except AttributeError: @@ -1758,7 +1757,7 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: if accum_func == np.minimum.accumulate: # Note: the accum_func comparison fails as an "is" comparison y = values.view("i8") - y[mask] = np.iinfo(np.int64).max + y[mask] = lib.i8max changed = True else: y = values @@ -1785,8 +1784,9 @@ def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike: # TODO: have this case go through a DTA method? # For DatetimeTZDtype, view result as M8[ns] npdtype = orig_dtype if isinstance(orig_dtype, np.dtype) else "M8[ns]" - # error: "Type[ExtensionArray]" has no attribute "_simple_new" - result = type(values)._simple_new( # type: ignore[attr-defined] + # Item "type" of "Union[Type[ExtensionArray], Type[ndarray[Any, Any]]]" + # has no attribute "_simple_new" + result = type(values)._simple_new( # type: ignore[union-attr] result.view(npdtype), dtype=orig_dtype ) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 76e23f1bf77e0..4aa4072e9de00 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -24,6 +24,7 @@ ) from pandas._typing import ( FrameOrSeries, + IndexLabel, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -1016,6 +1017,7 @@ class _GroupByMixin(PandasObject): """ _attributes: list[str] # in practice the same as Resampler._attributes + _selection: IndexLabel | None = None def __init__(self, obj, parent=None, groupby=None, **kwargs): # reached via ._gotitem and _get_resampler_for_grouping @@ -1027,6 +1029,7 @@ def __init__(self, obj, parent=None, groupby=None, **kwargs): # the resampler attributes for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) + self._selection = kwargs.get("selection") self.binner = parent.binner diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index ea34bc75b4e31..3f524a2da0011 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -362,8 +362,13 @@ def __init__( clean_keys.append(k) clean_objs.append(v) objs = clean_objs - name = getattr(keys, "name", None) - keys = Index(clean_keys, name=name) + + if isinstance(keys, MultiIndex): + # TODO: retain levels? 
+ keys = type(keys).from_tuples(clean_keys, names=keys.names) + else: + name = getattr(keys, "name", None) + keys = Index(clean_keys, name=name) if len(objs) == 0: raise ValueError("All objects passed were None") @@ -500,7 +505,7 @@ def get_result(self): cons = sample._constructor_expanddim index, columns = self.new_axes - df = cons(data, index=index) + df = cons(data, index=index, copy=self.copy) df.columns = columns return df.__finalize__(self, method="concat") diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 6a0fad9ee729b..56814b7692292 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -21,6 +21,7 @@ from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.missing import notna +import pandas.core.algorithms as algos from pandas.core.arrays import Categorical import pandas.core.common as com from pandas.core.indexes.api import ( @@ -106,7 +107,7 @@ def melt( id_vars + value_vars ) else: - idx = frame.columns.get_indexer(id_vars + value_vars) + idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars)) frame = frame.iloc[:, idx] else: frame = frame.copy() diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 143999a4677b3..fd30dea23e1a4 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -69,6 +69,7 @@ Categorical, Index, MultiIndex, + Series, ) from pandas.core import groupby import pandas.core.algorithms as algos @@ -80,10 +81,7 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import ( - DataFrame, - Series, - ) + from pandas import DataFrame from pandas.core.arrays import DatetimeArray @@ -903,17 +901,22 @@ def _maybe_add_join_keys( # error: Item "bool" of "Union[Any, bool]" has no attribute "all" if mask_left.all(): # type: ignore[union-attr] key_col = Index(rvals) + result_dtype = rvals.dtype # error: Item "bool" of "Union[Any, bool]" has no attribute "all" elif ( right_indexer is not None and mask_right.all() # type: ignore[union-attr] ): key_col = Index(lvals) + result_dtype = lvals.dtype else: key_col = Index(lvals).where(~mask_left, rvals) + result_dtype = lvals.dtype if result._is_label_reference(name): - result[name] = key_col + result[name] = Series( + key_col, dtype=result_dtype, index=result.index + ) elif result._is_level_reference(name): if isinstance(result.index, MultiIndex): key_col.name = name @@ -1775,16 +1778,26 @@ def _validate_specification(self) -> None: raise MergeError("missing right_by") # GH#29130 Check that merge keys do not have dtype object - lo_dtype = ( - self.left[self.left_on[0]].dtype - if not self.left_index - else self.left.index.dtype - ) - ro_dtype = ( - self.right[self.right_on[0]].dtype - if not self.right_index - else self.right.index.dtype - ) + if not self.left_index: + left_on = self.left_on[0] + lo_dtype = ( + self.left[left_on].dtype + if left_on in self.left.columns + else self.left.index.get_level_values(left_on) + ) + else: + lo_dtype = self.left.index.dtype + + if not self.right_index: + right_on = self.right_on[0] + ro_dtype = ( + self.right[right_on].dtype + if right_on in self.right.columns + else self.right.index.get_level_values(right_on) + ) + else: + ro_dtype = self.right.index.dtype + if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype): raise MergeError( f"Incompatible merge dtype, {repr(ro_dtype)} and " diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 51556fda6da04..7a5c2677307e2 100644 --- 
a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -482,7 +482,7 @@ def pivot( if columns is None: raise TypeError("pivot() missing 1 required argument: 'columns'") - columns = com.convert_to_list_like(columns) + columns_listlike = com.convert_to_list_like(columns) if values is None: if index is not None: @@ -494,28 +494,27 @@ def pivot( # error: Unsupported operand types for + ("List[Any]" and "ExtensionArray") # error: Unsupported left operand type for + ("ExtensionArray") indexed = data.set_index( - cols + columns, append=append # type: ignore[operator] + cols + columns_listlike, append=append # type: ignore[operator] ) else: if index is None: - index = [Series(data.index, name=data.index.name)] + index_list = [Series(data.index, name=data.index.name)] else: - index = com.convert_to_list_like(index) - index = [data[idx] for idx in index] + index_list = [data[idx] for idx in com.convert_to_list_like(index)] - data_columns = [data[col] for col in columns] - index.extend(data_columns) - index = MultiIndex.from_arrays(index) + data_columns = [data[col] for col in columns_listlike] + index_list.extend(data_columns) + multiindex = MultiIndex.from_arrays(index_list) if is_list_like(values) and not isinstance(values, tuple): # Exclude tuple because it is seen as a single column name values = cast(Sequence[Hashable], values) indexed = data._constructor( - data[values]._values, index=index, columns=values + data[values]._values, index=multiindex, columns=values ) else: - indexed = data._constructor_sliced(data[values]._values, index=index) - return indexed.unstack(columns) + indexed = data._constructor_sliced(data[values]._values, index=multiindex) + return indexed.unstack(columns_listlike) def crosstab( diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 93859eb11dd44..9203cf625d43a 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -25,11 +25,13 @@ is_object_dtype, needs_i8_conversion, ) +from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import notna import pandas.core.algorithms as algos from pandas.core.arrays import SparseArray from pandas.core.arrays.categorical import factorize_from_iterable +from pandas.core.construction import ensure_wrapped_if_datetimelike from pandas.core.frame import DataFrame from pandas.core.indexes.api import ( Index, @@ -233,15 +235,22 @@ def get_new_values(self, values, fill_value=None): if mask_all: dtype = values.dtype new_values = np.empty(result_shape, dtype=dtype) + name = np.dtype(dtype).name else: dtype, fill_value = maybe_promote(values.dtype, fill_value) - new_values = np.empty(result_shape, dtype=dtype) - new_values.fill(fill_value) + if isinstance(dtype, ExtensionDtype): + # GH#41875 + cls = dtype.construct_array_type() + new_values = cls._empty(result_shape, dtype=dtype) + new_values[:] = fill_value + name = dtype.name + else: + new_values = np.empty(result_shape, dtype=dtype) + new_values.fill(fill_value) + name = np.dtype(dtype).name new_mask = np.zeros(result_shape, dtype=bool) - name = np.dtype(dtype).name - # we need to convert to a basic dtype # and possibly coerce an input to our output dtype # e.g. 
ints -> floats @@ -267,6 +276,10 @@ def get_new_values(self, values, fill_value=None): # reconstruct dtype if needed if needs_i8_conversion(values.dtype): + # view as datetime64 so we can wrap in DatetimeArray and use + # DTA's view method + new_values = new_values.view("M8[ns]") + new_values = ensure_wrapped_if_datetimelike(new_values) new_values = new_values.view(values.dtype) return new_values, new_mask @@ -386,7 +399,8 @@ def _unstack_multiple(data, clocs, fill_value=None): return result - dummy = data.copy() + # GH#42579 deep=False to avoid consolidating + dummy = data.copy(deep=False) dummy.index = dummy_index unstacked = dummy.unstack("__placeholder__", fill_value=fill_value) @@ -1004,7 +1018,9 @@ def get_empty_frame(data) -> DataFrame: fill_value: bool | float | int if is_integer_dtype(dtype): fill_value = 0 - elif dtype == bool: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[bool]") + elif dtype == bool: # type: ignore[comparison-overlap] fill_value = False else: fill_value = 0.0 diff --git a/pandas/core/series.py b/pandas/core/series.py index 59ea6710ea6cd..ddbf4350a63c1 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -305,7 +305,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): hasnans = property( # type: ignore[assignment] base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__ ) - __hash__ = generic.NDFrame.__hash__ _mgr: SingleManager div: Callable[[Series, Any], Series] rdiv: Callable[[Series, Any], Series] @@ -388,9 +387,7 @@ def __init__( copy = False elif isinstance(data, np.ndarray): - # error: Argument 1 to "len" has incompatible type "dtype"; expected - # "Sized" - if len(data.dtype): # type: ignore[arg-type] + if len(data.dtype): # GH#13296 we are dealing with a compound dtype, which # should be treated as 2D raise ValueError( @@ -1231,14 +1228,15 @@ def _maybe_update_cacher( # a copy if ref is None: del self._cacher + elif len(self) == len(ref) and self.name in ref.columns: + # GH#42530 self.name must be in ref.columns + # to ensure column still in dataframe + # otherwise, either self or ref has swapped in new arrays + ref._maybe_cache_changed(cacher[0], self) else: - if len(self) == len(ref): - # otherwise, either self or ref has swapped in new arrays - ref._maybe_cache_changed(cacher[0], self) - else: - # GH#33675 we have swapped in a new array, so parent - # reference to self is now invalid - ref._item_cache.pop(cacher[0], None) + # GH#33675 we have swapped in a new array, so parent + # reference to self is now invalid + ref._item_cache.pop(cacher[0], None) super()._maybe_update_cacher(clear=clear, verify_is_copy=verify_is_copy) @@ -3865,6 +3863,65 @@ def nsmallest(self, n: int = 5, keep: str = "first") -> Series: """ return algorithms.SelectNSeries(self, n=n, keep=keep).nsmallest() + @doc( + klass=_shared_doc_kwargs["klass"], + extra_params=dedent( + """copy : bool, default True + Whether to copy underlying data.""" + ), + examples=dedent( + """Examples + -------- + >>> s = pd.Series( + ... ["A", "B", "A", "C"], + ... index=[ + ... ["Final exam", "Final exam", "Coursework", "Coursework"], + ... ["History", "Geography", "History", "Geography"], + ... ["January", "February", "March", "April"], + ... ], + ... ) + >>> s + Final exam History January A + Geography February B + Coursework History March A + Geography April C + dtype: object + + In the following example, we will swap the levels of the indices. 
+ Here, we will swap the levels column-wise, but levels can be swapped row-wise + in a similar manner. Note that column-wise is the default behaviour. + By not supplying any arguments for i and j, we swap the last and second to + last indices. + + >>> s.swaplevel() + Final exam January History A + February Geography B + Coursework March History A + April Geography C + dtype: object + + By supplying one argument, we can choose which index to swap the last + index with. We can for example swap the first index with the last one as + follows. + + >>> s.swaplevel(0) + January History Final exam A + February Geography Final exam B + March History Coursework A + April Geography Coursework C + dtype: object + + We can also define explicitly which indices we want to swap by supplying values + for both i and j. Here, we for example swap the first and second indices. + + >>> s.swaplevel(0, 1) + History Final exam January A + Geography Final exam February B + History Coursework March A + Geography Coursework April C + dtype: object""" + ), + ) def swaplevel(self, i=-2, j=-1, copy=True) -> Series: """ Swap levels i and j in a :class:`MultiIndex`. @@ -3873,15 +3930,16 @@ def swaplevel(self, i=-2, j=-1, copy=True) -> Series: Parameters ---------- - i, j : int, str - Level of the indices to be swapped. Can pass level name as string. - copy : bool, default True - Whether to copy underlying data. + i, j : int or str + Levels of the indices to be swapped. Can pass level name as string. + {extra_params} Returns ------- - Series - Series with levels swapped in MultiIndex. + {klass} + {klass} with levels swapped in MultiIndex. + + {examples} """ assert isinstance(self.index, MultiIndex) new_index = self.index.swaplevel(i, j) @@ -4206,7 +4264,7 @@ def apply( convert_dtype : bool, default True Try to find better dtype for elementwise function results. If False, leave as dtype=object. Note that the dtype is always - preserved for extension array dtypes, such as Categorical. + preserved for some extension array dtypes, such as Categorical. args : tuple Positional arguments passed to func after the series value. **kwargs @@ -4969,7 +5027,7 @@ def isin(self, values) -> Series: self, method="isin" ) - def between(self, left, right, inclusive=True) -> Series: + def between(self, left, right, inclusive="both") -> Series: """ Return boolean Series equivalent to left <= series <= right. @@ -4983,8 +5041,10 @@ def between(self, left, right, inclusive=True) -> Series: Left boundary. right : scalar or list-like Right boundary. - inclusive : bool, default True - Include boundaries. + inclusive : {"both", "neither", "left", "right"} + Include boundaries. Whether to set each bound as closed or open. + + .. 
versionchanged:: 1.3.0 Returns ------- @@ -5015,9 +5075,9 @@ def between(self, left, right, inclusive=True) -> Series: 4 False dtype: bool - With `inclusive` set to ``False`` boundary values are excluded: + With `inclusive` set to ``"neither"`` boundary values are excluded: - >>> s.between(1, 4, inclusive=False) + >>> s.between(1, 4, inclusive="neither") 0 True 1 False 2 False @@ -5035,12 +5095,34 @@ def between(self, left, right, inclusive=True) -> Series: 3 False dtype: bool """ - if inclusive: + if inclusive is True or inclusive is False: + warnings.warn( + "Boolean inputs to the `inclusive` argument are deprecated in" + "favour of `both` or `neither`.", + FutureWarning, + stacklevel=2, + ) + if inclusive: + inclusive = "both" + else: + inclusive = "neither" + if inclusive == "both": lmask = self >= left rmask = self <= right - else: + elif inclusive == "left": + lmask = self >= left + rmask = self < right + elif inclusive == "right": + lmask = self > left + rmask = self <= right + elif inclusive == "neither": lmask = self > left rmask = self < right + else: + raise ValueError( + "Inclusive has to be either string of 'both'," + "'left', 'right', or 'neither'." + ) return lmask & rmask diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 8531f93fba321..712e9785f47f7 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -40,8 +40,6 @@ from pandas import MultiIndex from pandas.core.indexes.base import Index -_INT64_MAX = np.iinfo(np.int64).max - def get_indexer_indexer( target: Index, @@ -133,7 +131,7 @@ def _int64_cut_off(shape) -> int: acc = 1 for i, mul in enumerate(shape): acc *= int(mul) - if not acc < _INT64_MAX: + if not acc < lib.i8max: return i return len(shape) @@ -153,7 +151,7 @@ def maybe_lift(lab, size) -> tuple[np.ndarray, int]: labels = list(labels) # Iteratively process all the labels in chunks sized so less - # than _INT64_MAX unique int ids will be required for each chunk + # than lib.i8max unique int ids will be required for each chunk while True: # how many levels can be done without overflow: nlev = _int64_cut_off(lshape) @@ -215,7 +213,7 @@ def is_int64_overflow_possible(shape) -> bool: for x in shape: the_prod *= int(x) - return the_prod >= _INT64_MAX + return the_prod >= lib.i8max def decons_group_index(comp_labels, shape): diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 323cb6bd9fedd..d3cdcec9da8f1 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -3005,7 +3005,7 @@ def casefold(self): "isdigit", docstring=_shared_docs["ismethods"] % _doc_args["isdigit"] ) isspace = _map_and_wrap( - "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isalnum"] + "isspace", docstring=_shared_docs["ismethods"] % _doc_args["isspace"] ) islower = _map_and_wrap( "islower", docstring=_shared_docs["ismethods"] % _doc_args["islower"] diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 014a702618bda..7414fdf190936 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -194,9 +194,9 @@ def _maybe_cache( if len(unique_dates) < len(arg): cache_dates = convert_listlike(unique_dates, format) cache_array = Series(cache_dates, index=unique_dates) - if not cache_array.is_unique: - # GH#39882 in case of None and NaT we get duplicates - cache_array = cache_array.drop_duplicates() + # GH#39882 and GH#35888 in case of None and NaT we get duplicates + if not cache_array.index.is_unique: + cache_array = 
cache_array[~cache_array.index.duplicated()] return cache_array diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 6dfd67f5dc5ec..7d2bb75934c33 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -190,7 +190,7 @@ def to_numeric(arg, errors="raise", downcast=None): # attempt downcast only if the data has been successfully converted # to a numerical dtype and if a downcast method has been specified if downcast is not None and is_numeric_dtype(values.dtype): - typecodes = None + typecodes: str | None = None if downcast in ("integer", "signed"): typecodes = np.typecodes["Integer"] @@ -208,8 +208,8 @@ def to_numeric(arg, errors="raise", downcast=None): if typecodes is not None: # from smallest to largest - for dtype in typecodes: - dtype = np.dtype(dtype) + for typecode in typecodes: + dtype = np.dtype(typecode) if dtype.itemsize <= values.dtype.itemsize: values = maybe_downcast_numeric(values, dtype) diff --git a/pandas/core/util/hashing.py b/pandas/core/util/hashing.py index fb5002648b6a5..09213b9e37aa2 100644 --- a/pandas/core/util/hashing.py +++ b/pandas/core/util/hashing.py @@ -14,6 +14,7 @@ import numpy as np +from pandas._libs import lib from pandas._libs.hashing import hash_object_array from pandas._typing import ( ArrayLike, @@ -138,7 +139,10 @@ def hash_pandas_object( ser = Series(h, index=obj.index, dtype="uint64", copy=False) elif isinstance(obj, ABCDataFrame): - hashes = (hash_array(series._values) for _, series in obj.items()) + hashes = ( + hash_array(series._values, encoding, hash_key, categorize) + for _, series in obj.items() + ) num_items = len(obj.columns) if index: index_hash_generator = ( @@ -244,7 +248,7 @@ def _hash_categorical(cat: Categorical, encoding: str, hash_key: str) -> np.ndar result = np.zeros(len(mask), dtype="uint64") if mask.any(): - result[mask] = np.iinfo(np.uint64).max + result[mask] = lib.u8max return result diff --git a/pandas/core/window/doc.py b/pandas/core/window/doc.py index df69553a74683..b80a73a930818 100644 --- a/pandas/core/window/doc.py +++ b/pandas/core/window/doc.py @@ -94,8 +94,8 @@ def create_section_header(header: str) -> str: ).replace("\n", "", 1) numba_notes = ( - "See :ref:`window.numba_engine` for extended documentation " - "and performance considerations for the Numba engine.\n\n" + "See :ref:`window.numba_engine` and :ref:`enhancingperf.numba` for " + "extended documentation and performance considerations for the Numba engine.\n\n" ) window_agg_numba_parameters = dedent( diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index c1d532d94eb83..04c81ae756855 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -448,12 +448,14 @@ def mean(self, *args, engine=None, engine_kwargs=None, **kwargs): if engine_kwargs is not None: raise ValueError("cython engine does not accept engine_kwargs") nv.validate_window_func("mean", args, kwargs) + + deltas = None if self.times is None else self._deltas window_func = partial( window_aggregations.ewma, com=self._com, adjust=self.adjust, ignore_na=self.ignore_na, - deltas=self._deltas, + deltas=deltas, ) return self._apply(window_func) else: diff --git a/pandas/io/common.py b/pandas/io/common.py index 06b00a9cbb4eb..16747569e80b5 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -15,6 +15,7 @@ ) import mmap import os +import tempfile from typing import ( IO, Any, @@ -943,8 +944,15 @@ def _is_binary_mode(handle: FilePathOrBuffer, mode: str) -> bool: if "t" in mode or "b" in mode: return "b" in mode - 
# classes that expect string but have 'b' in mode - text_classes = (codecs.StreamWriter, codecs.StreamReader, codecs.StreamReaderWriter) + # exceptions + text_classes = ( + # classes that expect string but have 'b' in mode + codecs.StreamWriter, + codecs.StreamReader, + codecs.StreamReaderWriter, + # cannot be wrapped in TextIOWrapper GH43439 + tempfile.SpooledTemporaryFile, + ) if issubclass(type(handle), text_classes): return False diff --git a/pandas/io/excel/_odswriter.py b/pandas/io/excel/_odswriter.py index efef86329314b..fa2779b01d681 100644 --- a/pandas/io/excel/_odswriter.py +++ b/pandas/io/excel/_odswriter.py @@ -29,6 +29,7 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, + **kwargs, ): from odf.opendocument import OpenDocumentSpreadsheet diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index bc067e216760c..03c46f139eeca 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -19,7 +19,10 @@ BaseExcelReader, ExcelWriter, ) -from pandas.io.excel._util import validate_freeze_panes +from pandas.io.excel._util import ( + combine_kwargs, + validate_freeze_panes, +) if TYPE_CHECKING: from openpyxl.descriptors.serialisable import Serialisable @@ -39,10 +42,13 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, + **kwargs, ): # Use the openpyxl module as the Excel writer. from openpyxl.workbook import Workbook + engine_kwargs = combine_kwargs(engine_kwargs, kwargs) + super().__init__( path, mode=mode, diff --git a/pandas/io/excel/_util.py b/pandas/io/excel/_util.py index 7d8028de23257..66a66fbbcd78a 100644 --- a/pandas/io/excel/_util.py +++ b/pandas/io/excel/_util.py @@ -1,6 +1,9 @@ from __future__ import annotations -from typing import MutableMapping +from typing import ( + Any, + MutableMapping, +) from pandas.compat._optional import import_optional_dependency @@ -246,3 +249,30 @@ def pop_header_name(row, index_col): header_name = None if header_name == "" else header_name return header_name, row[:i] + [""] + row[i + 1 :] + + +def combine_kwargs(engine_kwargs: dict[str, Any] | None, kwargs: dict) -> dict: + """ + Used to combine two sources of kwargs for the backend engine. + + Use of kwargs is deprecated, this function is solely for use in 1.3 and should + be removed in 1.4/2.0. Also _base.ExcelWriter.__new__ ensures either engine_kwargs + or kwargs must be None or empty respectively. + + Parameters + ---------- + engine_kwargs: dict + kwargs to be passed through to the engine. 
+ kwargs: dict + kwargs to be psased through to the engine (deprecated) + + Returns + ------- + engine_kwargs combined with kwargs + """ + if engine_kwargs is None: + result = {} + else: + result = engine_kwargs.copy() + result.update(kwargs) + return result diff --git a/pandas/io/excel/_xlsxwriter.py b/pandas/io/excel/_xlsxwriter.py index 7500a33b1f097..06c73f2c6199e 100644 --- a/pandas/io/excel/_xlsxwriter.py +++ b/pandas/io/excel/_xlsxwriter.py @@ -6,7 +6,10 @@ from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import validate_freeze_panes +from pandas.io.excel._util import ( + combine_kwargs, + validate_freeze_panes, +) class _XlsxStyler: @@ -175,11 +178,12 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, + **kwargs, ): # Use the xlsxwriter module as the Excel writer. from xlsxwriter import Workbook - engine_kwargs = engine_kwargs or {} + engine_kwargs = combine_kwargs(engine_kwargs, kwargs) if mode == "a": raise ValueError("Append mode is not supported with xlsxwriter!") diff --git a/pandas/io/excel/_xlwt.py b/pandas/io/excel/_xlwt.py index 8a7605b80f6b4..4dadf64b44515 100644 --- a/pandas/io/excel/_xlwt.py +++ b/pandas/io/excel/_xlwt.py @@ -9,7 +9,10 @@ from pandas._typing import StorageOptions from pandas.io.excel._base import ExcelWriter -from pandas.io.excel._util import validate_freeze_panes +from pandas.io.excel._util import ( + combine_kwargs, + validate_freeze_panes, +) if TYPE_CHECKING: from xlwt import XFStyle @@ -30,10 +33,13 @@ def __init__( storage_options: StorageOptions = None, if_sheet_exists: str | None = None, engine_kwargs: dict[str, Any] | None = None, + **kwargs, ): # Use the xlwt module as the Excel writer. 
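Since the Excel writer classes above now funnel both the supported engine_kwargs argument and the deprecated bare keyword arguments through combine_kwargs, a usage sketch of the supported spelling follows; the xlsxwriter option shown is illustrative only and assumes xlsxwriter is installed:

import pandas as pd

df = pd.DataFrame({"a": [1.0, float("nan")]})
with pd.ExcelWriter(
    "out.xlsx",
    engine="xlsxwriter",
    # forwarded to xlsxwriter.Workbook(...) by the writer class
    engine_kwargs={"options": {"nan_inf_to_errors": True}},
) as writer:
    df.to_excel(writer, sheet_name="Sheet1")
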
import xlwt + engine_kwargs = combine_kwargs(engine_kwargs, kwargs) + if mode == "a": raise ValueError("Append mode is not supported with xlwt!") diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index d1c19f348f901..b5e05288845a9 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1635,24 +1635,10 @@ def format_percentiles( percentiles = 100 * percentiles - # error: Item "List[Union[int, float]]" of "Union[ndarray, List[Union[int, float]], - # List[float], List[Union[str, float]]]" has no attribute "astype" - # error: Item "List[float]" of "Union[ndarray, List[Union[int, float]], List[float], - # List[Union[str, float]]]" has no attribute "astype" - # error: Item "List[Union[str, float]]" of "Union[ndarray, List[Union[int, float]], - # List[float], List[Union[str, float]]]" has no attribute "astype" - int_idx = np.isclose( - percentiles.astype(int), percentiles # type: ignore[union-attr] - ) + int_idx = np.isclose(percentiles.astype(int), percentiles) if np.all(int_idx): - # error: Item "List[Union[int, float]]" of "Union[ndarray, List[Union[int, - # float]], List[float], List[Union[str, float]]]" has no attribute "astype" - # error: Item "List[float]" of "Union[ndarray, List[Union[int, float]], - # List[float], List[Union[str, float]]]" has no attribute "astype" - # error: Item "List[Union[str, float]]" of "Union[ndarray, List[Union[int, - # float]], List[float], List[Union[str, float]]]" has no attribute "astype" - out = percentiles.astype(int).astype(str) # type: ignore[union-attr] + out = percentiles.astype(int).astype(str) return [i + "%" for i in out] unique_pcts = np.unique(percentiles) diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 93c3843b36846..3358ec07eb819 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -31,7 +31,10 @@ from pandas.util._decorators import doc import pandas as pd -from pandas import RangeIndex +from pandas import ( + IndexSlice, + RangeIndex, +) from pandas.api.types import is_list_like from pandas.core import generic import pandas.core.common as com @@ -426,6 +429,7 @@ def to_latex( multicol_align: str = "r", siunitx: bool = False, encoding: str | None = None, + convert_css: bool = False, ): r""" Write Styler to a file, buffer or string in LaTeX format. @@ -482,6 +486,10 @@ def to_latex( Set to ``True`` to structure LaTeX compatible with the {siunitx} package. encoding : str, default "utf-8" Character encoding setting. + convert_css : bool, default False + Convert simple cell-styles from CSS to LaTeX format. Any CSS not found in + conversion table is dropped. A style can be forced by adding option + `--latex`. See notes. Returns ------- @@ -661,7 +669,48 @@ def to_latex( & ix2 & \$3 & 4.400 & CATS \\ L1 & ix3 & \$2 & 6.600 & COWS \\ \end{tabular} + + **CSS Conversion** + + This method can convert a Styler constructured with HTML-CSS to LaTeX using + the following limited conversions. 
+ + ================== ==================== ============= ========================== + CSS Attribute CSS value LaTeX Command LaTeX Options + ================== ==================== ============= ========================== + font-weight | bold | bfseries + | bolder | bfseries + font-style | italic | itshape + | oblique | slshape + background-color | red cellcolor | {red}--lwrap + | #fe01ea | [HTML]{FE01EA}--lwrap + | #f0e | [HTML]{FF00EE}--lwrap + | rgb(128,255,0) | [rgb]{0.5,1,0}--lwrap + | rgba(128,0,0,0.5) | [rgb]{0.5,0,0}--lwrap + | rgb(25%,255,50%) | [rgb]{0.25,1,0.5}--lwrap + color | red color | {red} + | #fe01ea | [HTML]{FE01EA} + | #f0e | [HTML]{FF00EE} + | rgb(128,255,0) | [rgb]{0.5,1,0} + | rgba(128,0,0,0.5) | [rgb]{0.5,0,0} + | rgb(25%,255,50%) | [rgb]{0.25,1,0.5} + ================== ==================== ============= ========================== + + It is also possible to add user-defined LaTeX only styles to a HTML-CSS Styler + using the ``--latex`` flag, and to add LaTeX parsing options that the + converter will detect within a CSS-comment. + + >>> df = pd.DataFrame([[1]]) + >>> df.style.set_properties( + ... **{"font-weight": "bold /* --dwrap */", "Huge": "--latex--rwrap"} + ... ).to_latex(convert_css=True) + \begin{tabular}{lr} + {} & {0} \\ + 0 & {\bfseries}{\Huge{1}} \\ + \end{tabular} """ + obj = self._copy(deepcopy=True) # manipulate table_styles on obj, not self + table_selectors = ( [style["selector"] for style in self.table_styles] if self.table_styles is not None @@ -670,7 +719,7 @@ def to_latex( if column_format is not None: # add more recent setting to table_styles - self.set_table_styles( + obj.set_table_styles( [{"selector": "column_format", "props": f":{column_format}"}], overwrite=False, ) @@ -682,19 +731,19 @@ def to_latex( self.data.columns = RangeIndex(stop=len(self.data.columns)) numeric_cols = self.data._get_numeric_data().columns.to_list() self.data.columns = _original_columns - column_format = "" if self.hidden_index else "l" * self.data.index.nlevels + column_format = "" if self.hide_index_ else "l" * self.data.index.nlevels for ci, _ in enumerate(self.data.columns): if ci not in self.hidden_columns: column_format += ( ("r" if not siunitx else "S") if ci in numeric_cols else "l" ) - self.set_table_styles( + obj.set_table_styles( [{"selector": "column_format", "props": f":{column_format}"}], overwrite=False, ) if position: - self.set_table_styles( + obj.set_table_styles( [{"selector": "position", "props": f":{position}"}], overwrite=False, ) @@ -706,13 +755,13 @@ def to_latex( f"'raggedright', 'raggedleft', 'centering', " f"got: '{position_float}'" ) - self.set_table_styles( + obj.set_table_styles( [{"selector": "position_float", "props": f":{position_float}"}], overwrite=False, ) if hrules: - self.set_table_styles( + obj.set_table_styles( [ {"selector": "toprule", "props": ":toprule"}, {"selector": "midrule", "props": ":midrule"}, @@ -722,24 +771,25 @@ def to_latex( ) if label: - self.set_table_styles( + obj.set_table_styles( [{"selector": "label", "props": f":{{{label.replace(':', '§')}}}"}], overwrite=False, ) if caption: - self.set_caption(caption) + obj.set_caption(caption) if sparse_index is None: sparse_index = get_option("styler.sparse.index") if sparse_columns is None: sparse_columns = get_option("styler.sparse.columns") - latex = self._render_latex( + latex = obj._render_latex( sparse_index=sparse_index, sparse_columns=sparse_columns, multirow_align=multirow_align, multicol_align=multicol_align, + convert_css=convert_css, ) return 
save_to_buffer(latex, buf=buf, encoding=encoding) @@ -916,39 +966,60 @@ def _update_ctx(self, attrs: DataFrame) -> None: self.ctx[(i, j)].extend(css_list) def _copy(self, deepcopy: bool = False) -> Styler: - styler = Styler( - self.data, - precision=self.precision, - caption=self.caption, - table_attributes=self.table_attributes, - cell_ids=self.cell_ids, - na_rep=self.na_rep, - ) + """ + Copies a Styler, allowing for deepcopy or shallow copy - styler.uuid = self.uuid - styler.hidden_index = self.hidden_index + Copying a Styler aims to recreate a new Styler object which contains the same + data and styles as the original. - if deepcopy: - styler.ctx = copy.deepcopy(self.ctx) - styler._todo = copy.deepcopy(self._todo) - styler.table_styles = copy.deepcopy(self.table_styles) - styler.hidden_columns = copy.copy(self.hidden_columns) - styler.cell_context = copy.deepcopy(self.cell_context) - styler.tooltips = copy.deepcopy(self.tooltips) - else: - styler.ctx = self.ctx - styler._todo = self._todo - styler.table_styles = self.table_styles - styler.hidden_columns = self.hidden_columns - styler.cell_context = self.cell_context - styler.tooltips = self.tooltips + Data dependent attributes [copied and NOT exported]: + - formatting (._display_funcs) + - hidden index values or column values (.hidden_rows, .hidden_columns) + - tooltips + - cell_context (cell css classes) + - ctx (cell css styles) + - caption + + Non-data dependent attributes [copied and exported]: + - hidden index state and hidden columns state (.hide_index_, .hide_columns_) + - table_attributes + - table_styles + - applied styles (_todo) + + """ + # GH 40675 + styler = Styler( + self.data, # populates attributes 'data', 'columns', 'index' as shallow + uuid_len=self.uuid_len, + ) + shallow = [ # simple string or boolean immutables + "hide_index_", + "hide_columns_", + "table_attributes", + "cell_ids", + "caption", + ] + deep = [ # nested lists or dicts + "_display_funcs", + "hidden_rows", + "hidden_columns", + "ctx", + "cell_context", + "_todo", + "table_styles", + "tooltips", + ] + + for attr in shallow: + setattr(styler, attr, getattr(self, attr)) + + for attr in deep: + val = getattr(self, attr) + setattr(styler, attr, copy.deepcopy(val) if deepcopy else val) return styler def __copy__(self) -> Styler: - """ - Deep copy by default. - """ return self._copy(deepcopy=False) def __deepcopy__(self, memo) -> Styler: @@ -965,7 +1036,7 @@ def clear(self) -> None: self.cell_context.clear() self._todo.clear() - self.hidden_index = False + self.hide_index_ = False self.hidden_columns = [] # self.format and self.table_styles may be dependent on user # input in self.__init__() @@ -1096,7 +1167,7 @@ def _applymap( ) -> Styler: func = partial(func, **kwargs) # applymap doesn't take kwargs? if subset is None: - subset = pd.IndexSlice[:] + subset = IndexSlice[:] subset = non_reducing_slice(subset) result = self.data.loc[subset].applymap(func) self._update_ctx(result) @@ -1206,12 +1277,14 @@ def where( recommend using instead. The example: + >>> df = pd.DataFrame([[1, 2], [3, 4]]) >>> def cond(v, limit=4): ... return v > 1 and v != limit >>> df.style.where(cond, value='color:green;', other='color:red;') should be refactored to: + >>> def style_func(v, value, other, limit=4): ... cond = v > 1 and v != limit ... 
return value if cond else other @@ -1364,6 +1437,142 @@ def set_caption(self, caption: str | tuple) -> Styler: self.caption = caption return self + def set_sticky( + self, + axis: Axis = 0, + pixel_size: int | None = None, + levels: list[int] | None = None, + ) -> Styler: + """ + Add CSS to permanently display the index or column headers in a scrolling frame. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + Whether to make the index or column headers sticky. + pixel_size : int, optional + Required to configure the width of index cells or the height of column + header cells when sticking a MultiIndex (or with a named Index). + Defaults to 75 and 25 respectively. + levels : list of int + If ``axis`` is a MultiIndex the specific levels to stick. If ``None`` will + stick all levels. + + Returns + ------- + self : Styler + + Notes + ----- + This method uses the CSS 'position: sticky;' property to display. It is + designed to work with visible axes, therefore both: + + - `styler.set_sticky(axis="index").hide_index()` + - `styler.set_sticky(axis="columns").hide_columns()` + + may produce strange behaviour due to CSS controls with missing elements. + """ + if axis in [0, "index"]: + axis, obj = 0, self.data.index + pixel_size = 75 if not pixel_size else pixel_size + elif axis in [1, "columns"]: + axis, obj = 1, self.data.columns + pixel_size = 25 if not pixel_size else pixel_size + else: + raise ValueError("`axis` must be one of {0, 1, 'index', 'columns'}") + + props = "position:sticky; background-color:white;" + if not isinstance(obj, pd.MultiIndex): + # handling MultiIndexes requires different CSS + + if axis == 1: + # stick the first of and, if index names, the second + # if self._hide_columns then no here will exist: no conflict + styles: CSSStyles = [ + { + "selector": "thead tr:nth-child(1) th", + "props": props + "top:0px; z-index:2;", + } + ] + if not self.index.names[0] is None: + styles[0]["props"] = ( + props + f"top:0px; z-index:2; height:{pixel_size}px;" + ) + styles.append( + { + "selector": "thead tr:nth-child(2) th", + "props": props + + f"top:{pixel_size}px; z-index:2; height:{pixel_size}px; ", + } + ) + else: + # stick the first of each in both and + # if self._hide_index then no will exist in : no conflict + # but will exist in : conflict with initial element + styles = [ + { + "selector": "thead tr th:nth-child(1)", + "props": props + "left:0px; z-index:3 !important;", + }, + { + "selector": "tbody tr th:nth-child(1)", + "props": props + "left:0px; z-index:1;", + }, + ] + + else: + # handle the MultiIndex case + range_idx = list(range(obj.nlevels)) + levels = sorted(levels) if levels else range_idx + + if axis == 1: + styles = [] + for i, level in enumerate(levels): + styles.append( + { + "selector": f"thead tr:nth-child({level+1}) th", + "props": props + + ( + f"top:{i * pixel_size}px; height:{pixel_size}px; " + "z-index:2;" + ), + } + ) + if not all(name is None for name in self.index.names): + styles.append( + { + "selector": f"thead tr:nth-child({obj.nlevels+1}) th", + "props": props + + ( + f"top:{(i+1) * pixel_size}px; height:{pixel_size}px; " + "z-index:2;" + ), + } + ) + + else: + styles = [] + for i, level in enumerate(levels): + props_ = props + ( + f"left:{i * pixel_size}px; " + f"min-width:{pixel_size}px; " + f"max-width:{pixel_size}px; " + ) + styles.extend( + [ + { + "selector": f"thead tr th:nth-child({level+1})", + "props": props_ + "z-index:3 !important;", + }, + { + "selector": f"tbody tr th.level{level}", + "props": props_ + 
"z-index:1;", + }, + ] + ) + + return self.set_table_styles(styles, overwrite=False) + def set_table_styles( self, table_styles: dict[Any, CSSStyles] | CSSStyles, @@ -1509,37 +1718,169 @@ def set_na_rep(self, na_rep: str) -> StylerRenderer: self.na_rep = na_rep return self.format(na_rep=na_rep, precision=self.precision) - def hide_index(self) -> Styler: + def hide_index(self, subset: Subset | None = None) -> Styler: """ - Hide any indices from rendering. + Hide the entire index, or specific keys in the index from rendering. + + This method has dual functionality: + + - if ``subset`` is ``None`` then the entire index will be hidden whilst + displaying all data-rows. + - if a ``subset`` is given then those specific rows will be hidden whilst the + index itself remains visible. + + .. versionchanged:: 1.3.0 + + Parameters + ---------- + subset : label, array-like, IndexSlice, optional + A valid 1d input or single key along the index axis within + `DataFrame.loc[, :]`, to limit ``data`` to *before* applying + the function. Returns ------- self : Styler + + See Also + -------- + Styler.hide_columns: Hide the entire column headers row, or specific columns. + + Examples + -------- + Simple application hiding specific rows: + + >>> df = pd.DataFrame([[1,2], [3,4], [5,6]], index=["a", "b", "c"]) + >>> df.style.hide_index(["a", "b"]) + 0 1 + c 5 6 + + Hide the index and retain the data values: + + >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) + >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) + >>> df.style.format("{:.1f}").hide_index() + x y + a b c a b c + 0.1 0.0 0.4 1.3 0.6 -1.4 + 0.7 1.0 1.3 1.5 -0.0 -0.2 + 1.4 -0.8 1.6 -0.2 -0.4 -0.3 + 0.4 1.0 -0.2 -0.8 -1.2 1.1 + -0.6 1.2 1.8 1.9 0.3 0.3 + 0.8 0.5 -0.3 1.2 2.2 -0.8 + + Hide specific rows but retain the index: + + >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"])) + x y + a b c a b c + x b 0.7 1.0 1.3 1.5 -0.0 -0.2 + y b -0.6 1.2 1.8 1.9 0.3 0.3 + + Hide specific rows and the index: + + >>> df.style.format("{:.1f}").hide_index(subset=(slice(None), ["a", "c"])) + ... .hide_index() + x y + a b c a b c + 0.7 1.0 1.3 1.5 -0.0 -0.2 + -0.6 1.2 1.8 1.9 0.3 0.3 """ - self.hidden_index = True + if subset is None: + self.hide_index_ = True + else: + subset_ = IndexSlice[subset, :] # new var so mypy reads not Optional + subset = non_reducing_slice(subset_) + hide = self.data.loc[subset] + hrows = self.index.get_indexer_for(hide.index) + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Sequence[int]") + self.hidden_rows = hrows # type: ignore[assignment] return self - def hide_columns(self, subset: Subset) -> Styler: + def hide_columns(self, subset: Subset | None = None) -> Styler: """ - Hide columns from rendering. + Hide the column headers or specific keys in the columns from rendering. + + This method has dual functionality: + + - if ``subset`` is ``None`` then the entire column headers row will be hidden + whilst the data-values remain visible. + - if a ``subset`` is given then those specific columns, including the + data-values will be hidden, whilst the column headers row remains visible. + + .. versionchanged:: 1.3.0 Parameters ---------- - subset : label, array-like, IndexSlice - A valid 1d input or single key along the appropriate axis within - `DataFrame.loc[]`, to limit ``data`` to *before* applying the function. 
+ subset : label, array-like, IndexSlice, optional + A valid 1d input or single key along the columns axis within + `DataFrame.loc[:, ]`, to limit ``data`` to *before* applying + the function. Returns ------- self : Styler + + See Also + -------- + Styler.hide_index: Hide the entire index, or specific keys in the index. + + Examples + -------- + Simple application hiding specific columns: + + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + >>> df.style.hide_columns(["a", "b"]) + c + 0 3 + 1 6 + + Hide column headers and retain the data values: + + >>> midx = pd.MultiIndex.from_product([["x", "y"], ["a", "b", "c"]]) + >>> df = pd.DataFrame(np.random.randn(6,6), index=midx, columns=midx) + >>> df.style.format("{:.1f}").hide_columns() + x d 0.1 0.0 0.4 1.3 0.6 -1.4 + e 0.7 1.0 1.3 1.5 -0.0 -0.2 + f 1.4 -0.8 1.6 -0.2 -0.4 -0.3 + y d 0.4 1.0 -0.2 -0.8 -1.2 1.1 + e -0.6 1.2 1.8 1.9 0.3 0.3 + f 0.8 0.5 -0.3 1.2 2.2 -0.8 + + Hide specific columns but retain the column headers: + + >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"])) + x y + b b + x a 0.0 0.6 + b 1.0 -0.0 + c -0.8 -0.4 + y a 1.0 -1.2 + b 1.2 0.3 + c 0.5 2.2 + + Hide specific columns and the column headers: + + >>> df.style.format("{:.1f}").hide_columns(subset=(slice(None), ["a", "c"])) + ... .hide_columns() + x a 0.0 0.6 + b 1.0 -0.0 + c -0.8 -0.4 + y a 1.0 -1.2 + b 1.2 0.3 + c 0.5 2.2 """ - subset = non_reducing_slice(subset) - hidden_df = self.data.loc[subset] - hcols = self.columns.get_indexer_for(hidden_df.columns) - # error: Incompatible types in assignment (expression has type - # "ndarray", variable has type "Sequence[int]") - self.hidden_columns = hcols # type: ignore[assignment] + if subset is None: + self.hide_columns_ = True + else: + subset_ = IndexSlice[:, subset] # new var so mypy reads not Optional + subset = non_reducing_slice(subset_) + hide = self.data.loc[subset] + hcols = self.columns.get_indexer_for(hide.columns) + # error: Incompatible types in assignment (expression has type + # "ndarray", variable has type "Sequence[int]") + self.hidden_columns = hcols # type: ignore[assignment] return self # ----------------------------------------------------------------------- @@ -2004,15 +2345,15 @@ def highlight_max( Styler.highlight_quantile: Highlight values defined by a quantile with a style. """ - def f(data: FrameOrSeries, props: str) -> np.ndarray: - return np.where(data == np.nanmax(data.to_numpy()), props, "") - if props is None: props = f"background-color: {color};" # error: Argument 1 to "apply" of "Styler" has incompatible type # "Callable[[FrameOrSeries, str], ndarray]"; expected "Callable[..., Styler]" return self.apply( - f, axis=axis, subset=subset, props=props # type: ignore[arg-type] + partial(_highlight_value, op="max"), # type: ignore[arg-type] + axis=axis, + subset=subset, + props=props, ) def highlight_min( @@ -2055,15 +2396,15 @@ def highlight_min( Styler.highlight_quantile: Highlight values defined by a quantile with a style. 
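The highlight_max and highlight_min bodies above now delegate to a shared _highlight_value helper (defined later in this patch); their public behaviour is meant to be unchanged, e.g.:

import pandas as pd

df = pd.DataFrame({"a": [1, 5, 3], "b": [9, 2, 7]})
styled = df.style.highlight_max(axis=0, color="lightgreen").highlight_min(axis=0)
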
""" - def f(data: FrameOrSeries, props: str) -> np.ndarray: - return np.where(data == np.nanmin(data.to_numpy()), props, "") - if props is None: props = f"background-color: {color};" # error: Argument 1 to "apply" of "Styler" has incompatible type # "Callable[[FrameOrSeries, str], ndarray]"; expected "Callable[..., Styler]" return self.apply( - f, axis=axis, subset=subset, props=props # type: ignore[arg-type] + partial(_highlight_value, op="min"), # type: ignore[arg-type] + axis=axis, + subset=subset, + props=props, ) def highlight_between( @@ -2282,23 +2623,35 @@ def highlight_quantile( ) @classmethod - def from_custom_template(cls, searchpath, name): + def from_custom_template( + cls, searchpath, html_table: str | None = None, html_style: str | None = None + ): """ Factory function for creating a subclass of ``Styler``. - Uses a custom template and Jinja environment. + Uses custom templates and Jinja environment. + + .. versionchanged:: 1.3.0 Parameters ---------- searchpath : str or list Path or paths of directories containing the templates. - name : str - Name of your custom template to use for rendering. + html_table : str + Name of your custom template to replace the html_table template. + + .. versionadded:: 1.3.0 + + html_style : str + Name of your custom template to replace the html_style template. + + .. versionadded:: 1.3.0 Returns ------- MyStyler : subclass of Styler - Has the correct ``env`` and ``template`` class attributes set. + Has the correct ``env``,``template_html``, ``template_html_table`` and + ``template_html_style`` class attributes set. """ loader = jinja2.ChoiceLoader([jinja2.FileSystemLoader(searchpath), cls.loader]) @@ -2307,7 +2660,10 @@ def from_custom_template(cls, searchpath, name): # error: Invalid base class "cls" class MyStyler(cls): # type:ignore[valid-type,misc] env = jinja2.Environment(loader=loader) - template_html = env.get_template(name) + if html_table: + template_html_table = env.get_template(html_table) + if html_style: + template_html_style = env.get_template(html_style) return MyStyler @@ -2547,3 +2903,13 @@ def _highlight_between( else np.full(data.shape, True, dtype=bool) ) return np.where(g_left & l_right, props, "") + + +def _highlight_value(data: FrameOrSeries, op: str, props: str) -> np.ndarray: + """ + Return an array of css strings based on the condition of values matching an op. 
+ """ + value = getattr(data, op)(skipna=True) + if isinstance(data, DataFrame): # min/max must be done twice to return scalar + value = getattr(value, op)(skipna=True) + return np.where(data == value, props, "") diff --git a/pandas/io/formats/style_render.py b/pandas/io/formats/style_render.py index 7686d8a340c37..a2841ebcbab02 100644 --- a/pandas/io/formats/style_render.py +++ b/pandas/io/formats/style_render.py @@ -2,6 +2,7 @@ from collections import defaultdict from functools import partial +import re from typing import ( Any, Callable, @@ -66,6 +67,8 @@ class StylerRenderer: loader = jinja2.PackageLoader("pandas", "io/formats/templates") env = jinja2.Environment(loader=loader, trim_blocks=True) template_html = env.get_template("html.tpl") + template_html_table = env.get_template("html_table.tpl") + template_html_style = env.get_template("html_style.tpl") template_latex = env.get_template("latex.tpl") def __init__( @@ -97,7 +100,9 @@ def __init__( self.cell_ids = cell_ids # add rendering variables - self.hidden_index: bool = False + self.hide_index_: bool = False # bools for hiding col/row headers + self.hide_columns_: bool = False + self.hidden_rows: Sequence[int] = [] # sequence for specific hidden rows/cols self.hidden_columns: Sequence[int] = [] self.ctx: DefaultDict[tuple[int, int], CSSList] = defaultdict(list) self.cell_context: DefaultDict[tuple[int, int], str] = defaultdict(str) @@ -117,7 +122,11 @@ def _render_html(self, sparse_index: bool, sparse_columns: bool, **kwargs) -> st # TODO: namespace all the pandas keys d = self._translate(sparse_index, sparse_columns) d.update(kwargs) - return self.template_html.render(**d) + return self.template_html.render( + **d, + html_table_tpl=self.template_html_table, + html_style_tpl=self.template_html_style, + ) def _render_latex(self, sparse_index: bool, sparse_columns: bool, **kwargs) -> str: """ @@ -297,55 +306,57 @@ def _translate_header( head = [] # 1) column headers - for r in range(self.data.columns.nlevels): - index_blanks = [ - _element("th", blank_class, blank_value, not self.hidden_index) - ] * (self.data.index.nlevels - 1) - - name = self.data.columns.names[r] - column_name = [ - _element( - "th", - f"{blank_class if name is None else index_name_class} level{r}", - name if name is not None else blank_value, - not self.hidden_index, - ) - ] - - if clabels: - column_headers = [ + if not self.hide_columns_: + for r in range(self.data.columns.nlevels): + index_blanks = [ + _element("th", blank_class, blank_value, not self.hide_index_) + ] * (self.data.index.nlevels - 1) + + name = self.data.columns.names[r] + column_name = [ _element( "th", - f"{col_heading_class} level{r} col{c}", - value, - _is_visible(c, r, col_lengths), - attributes=( - f'colspan="{col_lengths.get((r, c), 0)}"' - if col_lengths.get((r, c), 0) > 1 - else "" - ), + f"{blank_class if name is None else index_name_class} level{r}", + name if name is not None else blank_value, + not self.hide_index_, ) - for c, value in enumerate(clabels[r]) ] - if len(self.data.columns) > max_cols: - # add an extra column with `...` value to indicate trimming - column_headers.append( + if clabels: + column_headers = [ _element( "th", - f"{col_heading_class} level{r} {trimmed_col_class}", - "...", - True, - attributes="", + f"{col_heading_class} level{r} col{c}", + value, + _is_visible(c, r, col_lengths), + attributes=( + f'colspan="{col_lengths.get((r, c), 0)}"' + if col_lengths.get((r, c), 0) > 1 + else "" + ), ) - ) - head.append(index_blanks + column_name + column_headers) + for 
c, value in enumerate(clabels[r]) + ] + + if len(self.data.columns) > max_cols: + # add an extra column with `...` value to indicate trimming + column_headers.append( + _element( + "th", + f"{col_heading_class} level{r} {trimmed_col_class}", + "...", + True, + attributes="", + ) + ) + head.append(index_blanks + column_name + column_headers) # 2) index names if ( self.data.index.names and com.any_not_none(*self.data.index.names) - and not self.hidden_index + and not self.hide_index_ + and not self.hide_columns_ ): index_names = [ _element( @@ -411,7 +422,9 @@ def _translate_body( The associated HTML elements needed for template rendering. """ # for sparsifying a MultiIndex - idx_lengths = _get_level_lengths(self.index, sparsify_index, max_rows) + idx_lengths = _get_level_lengths( + self.index, sparsify_index, max_rows, self.hidden_rows + ) rlabels = self.data.index.tolist()[:max_rows] # slice to allow trimming if self.data.index.nlevels == 1: @@ -425,7 +438,7 @@ def _translate_body( "th", f"{row_heading_class} level{c} {trimmed_row_class}", "...", - not self.hidden_index, + not self.hide_index_, attributes="", ) for c in range(self.data.index.nlevels) @@ -462,7 +475,7 @@ def _translate_body( "th", f"{row_heading_class} level{c} row{r}", value, - (_is_visible(r, c, idx_lengths) and not self.hidden_index), + (_is_visible(r, c, idx_lengths) and not self.hide_index_), id=f"level{c}_row{r}", attributes=( f'rowspan="{idx_lengths.get((c, r), 0)}"' @@ -496,7 +509,7 @@ def _translate_body( "td", f"{data_class} row{r} col{c}{cls}", value, - (c not in self.hidden_columns), + (c not in self.hidden_columns and r not in self.hidden_rows), attributes="", display_value=self._display_funcs[(r, c)](value), ) @@ -527,7 +540,7 @@ def _translate_latex(self, d: dict) -> None: d["head"] = [[col for col in row if col["is_visible"]] for row in d["head"]] body = [] for r, row in enumerate(d["body"]): - if self.hidden_index: + if self.hide_index_: row_body_headers = [] else: row_body_headers = [ @@ -627,7 +640,7 @@ def format( to. If the ``formatter`` argument is given in dict form but does not include all columns within the subset then these columns will have the default formatter applied. Any columns in the formatter dict excluded from the subset will - raise a ``KeyError``. + be ignored. When using a ``formatter`` string the dtypes must be compatible, otherwise a `ValueError` will be raised. @@ -842,7 +855,13 @@ def _get_level_lengths( last_label = j lengths[(i, last_label)] = 0 elif j not in hidden_elements: - lengths[(i, last_label)] += 1 + if lengths[(i, last_label)] == 0: + # if the previous iteration was first-of-kind but hidden then offset + last_label = j + lengths[(i, last_label)] = 1 + else: + # else add to previous iteration + lengths[(i, last_label)] += 1 non_zero_lengths = { element: length for element, length in lengths.items() if length >= 1 @@ -1253,7 +1272,9 @@ def _parse_latex_table_styles(table_styles: CSSStyles, selector: str) -> str | N return None -def _parse_latex_cell_styles(latex_styles: CSSList, display_value: str) -> str: +def _parse_latex_cell_styles( + latex_styles: CSSList, display_value: str, convert_css: bool = False +) -> str: r""" Mutate the ``display_value`` string including LaTeX commands from ``latex_styles``. 
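To connect the convert_css plumbing here with the to_latex(convert_css=True) option documented earlier in this patch, a small usage sketch; the expected output is summarised in a comment rather than reproduced verbatim:

import pandas as pd

df = pd.DataFrame([[1, 2]])
styler = df.style.applymap(lambda v: "font-weight: bold; color: #f0e;")
latex = styler.to_latex(convert_css=True)
# per the conversion table above, 'font-weight: bold' should become \bfseries and
# the shorthand hex colour a [HTML]{FF00EE} option on \color
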
@@ -1279,6 +1300,8 @@ def _parse_latex_cell_styles(latex_styles: CSSList, display_value: str) -> str: For example for styles: `[('c1', 'o1--wrap'), ('c2', 'o2')]` this returns: `{\c1o1 \c2o2{display_value}} """ + if convert_css: + latex_styles = _parse_latex_css_conversion(latex_styles) for (command, options) in latex_styles[::-1]: # in reverse for most recent style formatter = { "--wrap": f"{{\\{command}--to_parse {display_value}}}", @@ -1351,6 +1374,82 @@ def _parse_latex_options_strip(value: str | int | float, arg: str) -> str: return str(value).replace(arg, "").replace("/*", "").replace("*/", "").strip() +def _parse_latex_css_conversion(styles: CSSList) -> CSSList: + """ + Convert CSS (attribute,value) pairs to equivalent LaTeX (command,options) pairs. + + Ignore conversion if tagged with `--latex` option, skipped if no conversion found. + """ + + def font_weight(value, arg): + if value == "bold" or value == "bolder": + return "bfseries", f"{arg}" + return None + + def font_style(value, arg): + if value == "italic": + return "itshape", f"{arg}" + elif value == "oblique": + return "slshape", f"{arg}" + return None + + def color(value, user_arg, command, comm_arg): + """ + CSS colors have 5 formats to process: + + - 6 digit hex code: "#ff23ee" --> [HTML]{FF23EE} + - 3 digit hex code: "#f0e" --> [HTML]{FF00EE} + - rgba: rgba(128, 255, 0, 0.5) --> [rgb]{0.502, 1.000, 0.000} + - rgb: rgb(128, 255, 0,) --> [rbg]{0.502, 1.000, 0.000} + - string: red --> {red} + + Additionally rgb or rgba can be expressed in % which is also parsed. + """ + arg = user_arg if user_arg != "" else comm_arg + + if value[0] == "#" and len(value) == 7: # color is hex code + return command, f"[HTML]{{{value[1:].upper()}}}{arg}" + if value[0] == "#" and len(value) == 4: # color is short hex code + val = f"{value[1].upper()*2}{value[2].upper()*2}{value[3].upper()*2}" + return command, f"[HTML]{{{val}}}{arg}" + elif value[:3] == "rgb": # color is rgb or rgba + r = re.findall("(?<=\\()[0-9\\s%]+(?=,)", value)[0].strip() + r = float(r[:-1]) / 100 if "%" in r else int(r) / 255 + g = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[0].strip() + g = float(g[:-1]) / 100 if "%" in g else int(g) / 255 + if value[3] == "a": # color is rgba + b = re.findall("(?<=,)[0-9\\s%]+(?=,)", value)[1].strip() + else: # color is rgb + b = re.findall("(?<=,)[0-9\\s%]+(?=\\))", value)[0].strip() + b = float(b[:-1]) / 100 if "%" in b else int(b) / 255 + return command, f"[rgb]{{{r:.3f}, {g:.3f}, {b:.3f}}}{arg}" + else: + return command, f"{{{value}}}{arg}" # color is likely string-named + + CONVERTED_ATTRIBUTES: dict[str, Callable] = { + "font-weight": font_weight, + "background-color": partial(color, command="cellcolor", comm_arg="--lwrap"), + "color": partial(color, command="color", comm_arg=""), + "font-style": font_style, + } + + latex_styles: CSSList = [] + for (attribute, value) in styles: + if isinstance(value, str) and "--latex" in value: + # return the style without conversion but drop '--latex' + latex_styles.append((attribute, value.replace("--latex", ""))) + if attribute in CONVERTED_ATTRIBUTES.keys(): + arg = "" + for x in ["--wrap", "--nowrap", "--lwrap", "--dwrap", "--rwrap"]: + if x in str(value): + arg, value = x, _parse_latex_options_strip(value, x) + break + latex_style = CONVERTED_ATTRIBUTES[attribute](value, arg) + if latex_style is not None: + latex_styles.extend([latex_style]) + return latex_styles + + def _escape_latex(s): r""" Replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, diff --git 
a/pandas/io/formats/templates/html.tpl b/pandas/io/formats/templates/html.tpl index 880c78c8d6b05..8c63be3ad788a 100644 --- a/pandas/io/formats/templates/html.tpl +++ b/pandas/io/formats/templates/html.tpl @@ -1,16 +1,16 @@ -{# Update the template_structure.html documentation too #} +{# Update the html_style/table_structure.html documentation too #} {% if doctype_html %} -{% if not exclude_styles %}{% include "html_style.tpl" %}{% endif %} +{% if not exclude_styles %}{% include html_style_tpl %}{% endif %} -{% include "html_table.tpl" %} +{% include html_table_tpl %} {% elif not doctype_html %} -{% if not exclude_styles %}{% include "html_style.tpl" %}{% endif %} -{% include "html_table.tpl" %} +{% if not exclude_styles %}{% include html_style_tpl %}{% endif %} +{% include html_table_tpl %} {% endif %} diff --git a/pandas/io/formats/templates/latex.tpl b/pandas/io/formats/templates/latex.tpl index 66fe99642850f..fe081676d87af 100644 --- a/pandas/io/formats/templates/latex.tpl +++ b/pandas/io/formats/templates/latex.tpl @@ -39,7 +39,7 @@ {% endif %} {% for row in body %} {% for c in row %}{% if not loop.first %} & {% endif %} - {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value)}}{% endif %} + {%- if c.type == 'th' %}{{parse_header(c, multirow_align, multicol_align)}}{% else %}{{parse_cell(c.cellstyle, c.display_value, convert_css)}}{% endif %} {%- endfor %} \\ {% endfor %} {% set bottomrule = parse_table(table_styles, 'bottomrule') %} diff --git a/pandas/io/formats/xml.py b/pandas/io/formats/xml.py index 5be6ae0382d87..c8454e88970fa 100644 --- a/pandas/io/formats/xml.py +++ b/pandas/io/formats/xml.py @@ -195,14 +195,18 @@ def handle_indexes(self) -> None: This method will add indexes into attr_cols or elem_cols. 
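The handle_indexes change above short-circuits when index=False and keys frame_dicts by its first key rather than assuming 0; the user-facing effect is easiest to see from to_xml itself (assumes a working XML backend such as lxml):

import pandas as pd

df = pd.DataFrame({"shape": ["square", "circle"], "sides": [4, 3]})
print(df.to_xml(index=False))   # no index element is written
print(df.to_xml())              # index values are emitted alongside the columns
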
""" + if not self.index: + return + + first_key = next(iter(self.frame_dicts)) indexes: list[str] = [ - x for x in self.frame_dicts[0].keys() if x not in self.orig_cols + x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols ] - if self.attr_cols and self.index: + if self.attr_cols: self.attr_cols = indexes + self.attr_cols - if self.elem_cols and self.index: + if self.elem_cols: self.elem_cols = indexes + self.elem_cols def get_prefix_uri(self) -> str: @@ -307,7 +311,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.frame_dicts[0].keys()) + self.elem_cols = list(self.d.keys()) self.build_elems() else: @@ -477,7 +481,7 @@ def build_tree(self) -> bytes: self.elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}") if not self.attr_cols and not self.elem_cols: - self.elem_cols = list(self.frame_dicts[0].keys()) + self.elem_cols = list(self.d.keys()) self.build_elems() else: diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b7523fada07d0..49384cfb2e554 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,14 +309,17 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if Version(self.api.__version__) >= Version("0.7.1"): + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs["pandas_nulls"] = False if use_nullable_dtypes: raise ValueError( "The 'use_nullable_dtypes' argument is not supported for the " "fastparquet engine" ) path = stringify_path(path) - parquet_kwargs = {} handles = None if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") @@ -337,6 +340,7 @@ def read( path, "rb", is_text=False, storage_options=storage_options ) path = handles.handle + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) result = parquet_file.to_pandas(columns=columns, **kwargs) @@ -470,7 +474,8 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional diff --git a/pandas/io/parsers/base_parser.py b/pandas/io/parsers/base_parser.py index 2a86ff13a2edc..f914e0601fb89 100644 --- a/pandas/io/parsers/base_parser.py +++ b/pandas/io/parsers/base_parser.py @@ -23,6 +23,7 @@ from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing from pandas._typing import ( + ArrayLike, DtypeArg, FilePathOrBuffer, final, @@ -803,6 +804,29 @@ def _do_date_conversions(self, names, data): return names, data + def _check_data_length(self, columns: list[str], data: list[ArrayLike]) -> None: + """Checks if length of data is equal to length of column names. + + One set of trailing commas is allowed. self.index_col not False + results in a ParserError previously when lengths do not match. + + Parameters + ---------- + columns: list of column names + data: list of array-likes containing the data column-wise. 
+ """ + if not self.index_col and len(columns) != len(data) and columns: + if len(columns) == len(data) - 1 and np.all( + (is_object_dtype(data[-1]) and data[-1] == "") | isna(data[-1]) + ): + return + warnings.warn( + "Length of header or names does not match length of data. This leads " + "to a loss of data with index_col=False.", + ParserWarning, + stacklevel=6, + ) + def _evaluate_usecols(self, usecols, names): """ Check whether or not the 'usecols' parameter diff --git a/pandas/io/parsers/c_parser_wrapper.py b/pandas/io/parsers/c_parser_wrapper.py index 5c1f8f94a72da..ae62cc3b45578 100644 --- a/pandas/io/parsers/c_parser_wrapper.py +++ b/pandas/io/parsers/c_parser_wrapper.py @@ -300,6 +300,8 @@ def read(self, nrows=None): # columns as list alldata = [x[1] for x in data_tups] + if self.usecols is None: + self._check_data_length(names, alldata) data = {k: v for k, (i, v) in zip(names, data_tups)} @@ -363,7 +365,9 @@ def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict: numpy_dtypes, # type: ignore[arg-type] [], ) - if common_type == object: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", + # right operand type: "Type[object]") + if common_type == object: # type: ignore[comparison-overlap] warning_columns.append(str(name)) dtype = dtypes.pop() diff --git a/pandas/io/parsers/python_parser.py b/pandas/io/parsers/python_parser.py index 670868c6f4261..42e9db75847eb 100644 --- a/pandas/io/parsers/python_parser.py +++ b/pandas/io/parsers/python_parser.py @@ -4,6 +4,7 @@ abc, defaultdict, ) +from copy import copy import csv from io import StringIO import re @@ -25,6 +26,7 @@ ) from pandas.core.dtypes.common import is_integer +from pandas.core.dtypes.inference import is_dict_like from pandas.io.parsers.base_parser import ( ParserBase, @@ -80,7 +82,7 @@ def __init__(self, f: FilePathOrBuffer | list, **kwds): self.verbose = kwds["verbose"] self.converters = kwds["converters"] - self.dtype = kwds["dtype"] + self.dtype = copy(kwds["dtype"]) self.thousands = kwds["thousands"] self.decimal = kwds["decimal"] @@ -292,6 +294,8 @@ def _exclude_implicit_index(self, alldata): offset = len(self.index_col) # type: ignore[has-type] len_alldata = len(alldata) + self._check_data_length(names, alldata) + return { name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata }, names @@ -424,11 +428,11 @@ def _infer_columns(self): cur_count = counts[col] if ( self.dtype is not None + and is_dict_like(self.dtype) and self.dtype.get(old_col) is not None and self.dtype.get(col) is None ): self.dtype.update({col: self.dtype.get(old_col)}) - this_columns[i] = col counts[col] = cur_count + 1 elif have_mi_columns: diff --git a/pandas/io/parsers/readers.py b/pandas/io/parsers/readers.py index 06bdbe3054a15..c639a4a9d494e 100644 --- a/pandas/io/parsers/readers.py +++ b/pandas/io/parsers/readers.py @@ -1302,7 +1302,12 @@ def _refine_defaults_read( if delimiter and (sep is not lib.no_default): raise ValueError("Specified a sep and a delimiter; you can only specify one.") - if names is not lib.no_default and prefix is not lib.no_default: + if ( + names is not None + and names is not lib.no_default + and prefix is not None + and prefix is not lib.no_default + ): raise ValueError("Specified named and prefix; you can only specify one.") kwds["names"] = None if names is lib.no_default else names diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 208b8a008ffe6..1b4bd62ee7db7 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -5000,7 +5000,7 @@ 
def _maybe_convert_for_string_atom( # check for column in the values conflicts if existing_col is not None: eci = existing_col.validate_col(itemsize) - if eci > itemsize: + if eci is not None and eci > itemsize: itemsize = eci data_converted = data_converted.astype(f"|S{itemsize}", copy=False) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ffaebb3c10ae2..20e035891a625 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -853,15 +853,25 @@ def __eq__(self, other: Any) -> bool: @classmethod def get_base_missing_value(cls, dtype: np.dtype) -> int | float: - if dtype == np.int8: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + if dtype == np.int8: # type: ignore[comparison-overlap] value = cls.BASE_MISSING_VALUES["int8"] - elif dtype == np.int16: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int16: # type: ignore[comparison-overlap] value = cls.BASE_MISSING_VALUES["int16"] - elif dtype == np.int32: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int32: # type: ignore[comparison-overlap] value = cls.BASE_MISSING_VALUES["int32"] - elif dtype == np.float32: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[floating[Any]]") + elif dtype == np.float32: # type: ignore[comparison-overlap] value = cls.BASE_MISSING_VALUES["float32"] - elif dtype == np.float64: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[floating[Any]]") + elif dtype == np.float64: # type: ignore[comparison-overlap] value = cls.BASE_MISSING_VALUES["float64"] else: raise ValueError("Unsupported dtype") @@ -1663,12 +1673,7 @@ def read( if self.dtyplist[i] is not None: col = data.columns[i] dtype = data[col].dtype - # error: Value of type variable "_DTypeScalar" of "dtype" cannot be - # "object" - if ( - dtype != np.dtype(object) # type: ignore[type-var] - and dtype != self.dtyplist[i] - ): + if dtype != np.dtype(object) and dtype != self.dtyplist[i]: requires_type_conversion = True data_formatted.append( (col, Series(data[col], ix, self.dtyplist[i])) @@ -2033,15 +2038,25 @@ def _dtype_to_stata_type(dtype: np.dtype, column: Series) -> int: # do? 
itemsize = max_len_string_array(ensure_object(column._values)) return max(itemsize, 1) - elif dtype == np.float64: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[floating[Any]]") + elif dtype == np.float64: # type: ignore[comparison-overlap] return 255 - elif dtype == np.float32: + # Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[floating[Any]]") + elif dtype == np.float32: # type: ignore[comparison-overlap] return 254 - elif dtype == np.int32: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int32: # type: ignore[comparison-overlap] return 253 - elif dtype == np.int16: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int16: # type: ignore[comparison-overlap] return 252 - elif dtype == np.int8: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int8: # type: ignore[comparison-overlap] return 251 else: # pragma : no cover raise NotImplementedError(f"Data type {dtype} not supported.") @@ -2761,15 +2776,25 @@ def _dtype_to_stata_type_117(dtype: np.dtype, column: Series, force_strl: bool) if itemsize <= 2045: return itemsize return 32768 - elif dtype == np.float64: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[floating[Any]]") + elif dtype == np.float64: # type: ignore[comparison-overlap] return 65526 - elif dtype == np.float32: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[floating[Any]]") + elif dtype == np.float32: # type: ignore[comparison-overlap] return 65527 - elif dtype == np.int32: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") [comparison-overlap] + elif dtype == np.int32: # type: ignore[comparison-overlap] return 65528 - elif dtype == np.int16: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int16: # type: ignore[comparison-overlap] return 65529 - elif dtype == np.int8: + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[signedinteger[Any]]") + elif dtype == np.int8: # type: ignore[comparison-overlap] return 65530 else: # pragma : no cover raise NotImplementedError(f"Data type {dtype} not supported.") diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py index 00d87b707580d..302d5ede0ae86 100644 --- a/pandas/plotting/_core.py +++ b/pandas/plotting/_core.py @@ -7,8 +7,6 @@ Sequence, ) -import pkg_resources - from pandas._config import get_option from pandas._typing import IndexLabel @@ -1745,6 +1743,8 @@ def _load_backend(backend: str) -> types.ModuleType: types.ModuleType The imported backend. """ + import pkg_resources + if backend == "matplotlib": # Because matplotlib is an optional dependency and first-party backend, # we need to attempt an import here to raise an ImportError if needed. 
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py index 9bfa24b6371ab..ebe336bd55c97 100644 --- a/pandas/plotting/_matplotlib/tools.py +++ b/pandas/plotting/_matplotlib/tools.py @@ -414,8 +414,12 @@ def handle_shared_axes( except IndexError: # if gridspec is used, ax.rowNum and ax.colNum may different # from layout shape. in this case, use last_row logic + if compat.mpl_ge_3_4_0(): + is_last_row = lambda x: x.get_subplotspec().is_last_row() + else: + is_last_row = lambda x: x.is_last_row() for ax in axarr: - if ax.is_last_row(): + if is_last_row(ax): continue if sharex or _has_externally_shared_axis(ax, "x"): _remove_labels_from_axis(ax.xaxis) diff --git a/pandas/tests/apply/test_frame_apply.py b/pandas/tests/apply/test_frame_apply.py index 2511f6fc2563c..c34447764b311 100644 --- a/pandas/tests/apply/test_frame_apply.py +++ b/pandas/tests/apply/test_frame_apply.py @@ -53,6 +53,17 @@ def test_apply_axis1_with_ea(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize( + "data, dtype", + [(1, None), (1, CategoricalDtype([1])), (Timestamp("2013-01-01", tz="UTC"), None)], +) +def test_agg_axis1_duplicate_index(data, dtype): + # GH 42380 + expected = DataFrame([[data], [data]], index=["a", "a"], dtype=dtype) + result = expected.agg(lambda x: x, axis=1) + tm.assert_frame_equal(result, expected) + + def test_apply_mixed_datetimelike(): # mixed datetimelike # GH 7778 @@ -660,13 +671,14 @@ def test_apply_dup_names_multi_agg(): tm.assert_frame_equal(result, expected) -def test_apply_nested_result_axis_1(): +@pytest.mark.parametrize("op", ["apply", "agg"]) +def test_apply_nested_result_axis_1(op): # GH 13820 def apply_list(row): return [2 * row["A"], 2 * row["C"], 2 * row["B"]] df = DataFrame(np.zeros((4, 4)), columns=list("ABCD")) - result = df.apply(apply_list, axis=1) + result = getattr(df, op)(apply_list, axis=1) expected = Series( [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]] ) @@ -1279,9 +1291,9 @@ def test_size_as_str(how, axis): # on the columns result = getattr(df, how)("size", axis=axis) if axis == 0 or axis == "index": - expected = Series(df.shape[0], index=df.columns, name="size") + expected = Series(df.shape[0], index=df.columns) else: - expected = Series(df.shape[1], index=df.index, name="size") + expected = Series(df.shape[1], index=df.index) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/apply/test_frame_transform.py b/pandas/tests/apply/test_frame_transform.py index 9050fab702881..0d3d4eecf92aa 100644 --- a/pandas/tests/apply/test_frame_transform.py +++ b/pandas/tests/apply/test_frame_transform.py @@ -39,8 +39,15 @@ def test_transform_ufunc(axis, float_frame, frame_or_series): @pytest.mark.parametrize("op", frame_transform_kernels) -def test_transform_groupby_kernel(axis, float_frame, op, request): +def test_transform_groupby_kernel(axis, float_frame, op, using_array_manager, request): # GH 35964 + if using_array_manager and op == "pct_change" and axis in (1, "columns"): + # TODO(ArrayManager) shift with axis=1 + request.node.add_marker( + pytest.mark.xfail( + reason="shift axis=1 not yet implemented for ArrayManager" + ) + ) args = [0.0] if op == "fillna" else [] if axis == 0 or axis == "index": diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index 5731f02430a9d..c6240600d3a05 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -2,9 +2,6 @@ This module tests the functionality of 
StringArray and ArrowStringArray. Tests for the str accessors are in pandas/tests/strings/test_string_array.py """ - -import re - import numpy as np import pytest @@ -314,7 +311,7 @@ def test_astype_int(dtype): tm.assert_numpy_array_equal(result, expected) arr = pd.array(["1", pd.NA, "3"], dtype=dtype) - msg = re.escape("int() argument must be a string, a bytes-like object or a number") + msg = r"int\(\) argument must be a string, a bytes-like object or a( real)? number" with pytest.raises(TypeError, match=msg): arr.astype("int64") diff --git a/pandas/tests/base/test_transpose.py b/pandas/tests/base/test_transpose.py index 5ba278368834c..246f33d27476c 100644 --- a/pandas/tests/base/test_transpose.py +++ b/pandas/tests/base/test_transpose.py @@ -1,6 +1,10 @@ import numpy as np import pytest +from pandas import ( + CategoricalDtype, + DataFrame, +) import pandas._testing as tm @@ -25,3 +29,28 @@ def test_numpy_transpose(index_or_series_obj): with pytest.raises(ValueError, match=msg): np.transpose(obj, axes=1) + + +@pytest.mark.parametrize( + "data, transposed_data, index, columns, dtype", + [ + ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], int), + ([[1], [2]], [[1, 2]], ["a", "a"], ["b"], CategoricalDtype([1, 2])), + ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], int), + ([[1, 2]], [[1], [2]], ["b"], ["a", "a"], CategoricalDtype([1, 2])), + ([[1, 2], [3, 4]], [[1, 3], [2, 4]], ["a", "a"], ["b", "b"], int), + ( + [[1, 2], [3, 4]], + [[1, 3], [2, 4]], + ["a", "a"], + ["b", "b"], + CategoricalDtype([1, 2, 3, 4]), + ), + ], +) +def test_duplicate_labels(data, transposed_data, index, columns, dtype): + # GH 42380 + df = DataFrame(data, index=index, columns=columns, dtype=dtype) + result = df.T + expected = DataFrame(transposed_data, index=columns, columns=index, dtype=dtype) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/dtypes/cast/test_dict_compat.py b/pandas/tests/dtypes/cast/test_dict_compat.py new file mode 100644 index 0000000000000..13dc82d779f95 --- /dev/null +++ b/pandas/tests/dtypes/cast/test_dict_compat.py @@ -0,0 +1,14 @@ +import numpy as np + +from pandas.core.dtypes.cast import dict_compat + +from pandas import Timestamp + + +def test_dict_compat(): + data_datetime64 = {np.datetime64("1990-03-15"): 1, np.datetime64("2015-03-15"): 2} + data_unchanged = {1: 2, 3: 4, 5: 6} + expected = {Timestamp("1990-3-15"): 1, Timestamp("2015-03-15"): 2} + assert dict_compat(data_datetime64) == expected + assert dict_compat(expected) == expected + assert dict_compat(data_unchanged) == data_unchanged diff --git a/pandas/tests/dtypes/test_inference.py b/pandas/tests/dtypes/test_inference.py index 3c798d82b9485..64a635707c2ff 100644 --- a/pandas/tests/dtypes/test_inference.py +++ b/pandas/tests/dtypes/test_inference.py @@ -150,6 +150,18 @@ def foo(): foo() +def test_is_list_like_iter_is_none(): + # GH 43373 + # is_list_like was yielding false positives with __iter__ == None + class NotListLike: + def __getitem__(self, item): + return self + + __iter__ = None + + assert not inference.is_list_like(NotListLike()) + + def test_is_sequence(): is_seq = inference.is_sequence assert is_seq((1, 2)) diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py index 96833a2e49fa1..ac181af7875b5 100644 --- a/pandas/tests/extension/base/getitem.py +++ b/pandas/tests/extension/base/getitem.py @@ -425,3 +425,23 @@ def test_item(self, data): with pytest.raises(ValueError, match=msg): s.item() + + def test_ellipsis_index(self): + # GH42430 1D slices over extension types turn 
into N-dimensional slices over + # ExtensionArrays + class CapturingStringArray(pd.arrays.StringArray): + """Extend StringArray to capture arguments to __getitem__""" + + def __getitem__(self, item): + self.last_item_arg = item + return super().__getitem__(item) + + df = pd.DataFrame( + {"col1": CapturingStringArray(np.array(["hello", "world"], dtype=object))} + ) + _ = df.iloc[:1] + + # String comparison because there's no native way to compare slices. + # Before the fix for GH42430, last_item_arg would get set to the 2D slice + # (Ellipsis, slice(None, 1, None)) + self.assert_equal(str(df["col1"].array.last_item_arg), "slice(None, 1, None)") diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 3e6b1cbfb311c..8f241679d5108 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -4,6 +4,7 @@ import pytest from pandas.core.dtypes.common import ( + is_datetime64tz_dtype, is_interval_dtype, is_period_dtype, ) @@ -328,6 +329,9 @@ def test_unstack(self, data, index, obj): ) if obj == "series": # TODO: special cases belong in dtype-specific tests + if is_datetime64tz_dtype(data.dtype): + assert expected.dtypes.apply(is_datetime64tz_dtype).all() + expected = expected.astype(object) if is_period_dtype(data.dtype): assert expected.dtypes.apply(is_period_dtype).all() expected = expected.astype(object) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 7a3f88d0d6c41..99d92a5bbf774 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -261,18 +261,7 @@ def test_dataframe_constructor_with_dtype(): tm.assert_frame_equal(result, expected) -@pytest.mark.parametrize( - "frame", - [ - pytest.param( - True, - marks=pytest.mark.xfail( - reason="pd.concat call inside NDFrame.astype reverts the dtype" - ), - ), - False, - ], -) +@pytest.mark.parametrize("frame", [True, False]) def test_astype_dispatches(frame): # This is a dtype-specific test that ensures Series[decimal].astype # gets all the way through to ExtensionArray.astype diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py index bb8347f0a0122..54e31e05e8b0e 100644 --- a/pandas/tests/extension/test_datetime.py +++ b/pandas/tests/extension/test_datetime.py @@ -193,40 +193,6 @@ def test_concat_mixed_dtypes(self, data): # drops the tz. super().test_concat_mixed_dtypes(data) - @pytest.mark.parametrize("obj", ["series", "frame"]) - def test_unstack(self, obj): - # GH-13287: can't use base test, since building the expected fails. 
- dtype = DatetimeTZDtype(tz="US/Central") - data = DatetimeArray._from_sequence( - ["2000", "2001", "2002", "2003"], - dtype=dtype, - ) - index = pd.MultiIndex.from_product(([["A", "B"], ["a", "b"]]), names=["a", "b"]) - - if obj == "series": - ser = pd.Series(data, index=index) - expected = pd.DataFrame( - {"A": data.take([0, 1]), "B": data.take([2, 3])}, - index=pd.Index(["a", "b"], name="b"), - ) - expected.columns.name = "a" - - else: - ser = pd.DataFrame({"A": data, "B": data}, index=index) - expected = pd.DataFrame( - { - ("A", "A"): data.take([0, 1]), - ("A", "B"): data.take([2, 3]), - ("B", "A"): data.take([0, 1]), - ("B", "B"): data.take([2, 3]), - }, - index=pd.Index(["a", "b"], name="b"), - ) - expected.columns.names = [None, "a"] - - result = ser.unstack(0) - self.assert_equal(result, expected) - class TestSetitem(BaseDatetimeTests, base.BaseSetitemTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index f0d3fb7ff9e1b..9c21f717573c1 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -13,6 +13,7 @@ be added to the array-specific tests in `pandas/tests/arrays/`. """ + import numpy as np import pytest diff --git a/pandas/tests/frame/constructors/test_from_records.py b/pandas/tests/frame/constructors/test_from_records.py index 35ad9f3e9693b..2322d5f995964 100644 --- a/pandas/tests/frame/constructors/test_from_records.py +++ b/pandas/tests/frame/constructors/test_from_records.py @@ -457,3 +457,16 @@ def test_from_records_empty_with_nonempty_fields_gh3682(self): b = a[:0] df2 = DataFrame.from_records(b, index="id") tm.assert_frame_equal(df2, df.iloc[:0]) + + def test_from_records_empty2(self): + # GH#42456 + dtype = [("prop", int)] + shape = (0, len(dtype)) + arr = np.empty(shape, dtype=dtype) + + result = DataFrame.from_records(arr) + expected = DataFrame({"prop": np.array([], dtype=int)}) + tm.assert_frame_equal(result, expected) + + alt = DataFrame(arr) + tm.assert_frame_equal(alt, expected) diff --git a/pandas/tests/frame/indexing/test_getitem.py b/pandas/tests/frame/indexing/test_getitem.py index 073e7b0357124..41cc47a91dbee 100644 --- a/pandas/tests/frame/indexing/test_getitem.py +++ b/pandas/tests/frame/indexing/test_getitem.py @@ -8,6 +8,7 @@ CategoricalDtype, CategoricalIndex, DataFrame, + DatetimeIndex, Index, MultiIndex, Series, @@ -363,3 +364,26 @@ def test_getitem_slice_float64(self, frame_or_series): result = obj.loc[start:end] tm.assert_equal(result, expected) + + def test_getitem_datetime_slice(self): + # GH#43223 + df = DataFrame( + {"a": 0}, + index=DatetimeIndex( + [ + "11.01.2011 22:00", + "11.01.2011 23:00", + "12.01.2011 00:00", + "2011-01-13 00:00", + ] + ), + ) + with tm.assert_produces_warning(FutureWarning): + result = df["2011-01-01":"2011-11-01"] + expected = DataFrame( + {"a": 0}, + index=DatetimeIndex( + ["11.01.2011 22:00", "11.01.2011 23:00", "2011-01-13 00:00"] + ), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py index e2121fa2318eb..6901f5a8be3e5 100644 --- a/pandas/tests/frame/indexing/test_indexing.py +++ b/pandas/tests/frame/indexing/test_indexing.py @@ -1206,6 +1206,13 @@ def test_getitem_interval_index_partial_indexing(self): res = df.loc[:, 0.5] tm.assert_series_equal(res, expected) + def test_setitem_array_as_cell_value(self): + # GH#43422 + df = DataFrame(columns=["a", "b"], dtype=object) + df.loc[0] = {"a": np.zeros((2,)), "b": np.zeros((2, 
2))} + expected = DataFrame({"a": [np.zeros((2,))], "b": [np.zeros((2, 2))]}) + tm.assert_frame_equal(df, expected) + class TestDataFrameIndexingUInt64: def test_setitem(self, uint64_frame): diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py index 881f8db305240..775a5a38768e6 100644 --- a/pandas/tests/frame/methods/test_astype.py +++ b/pandas/tests/frame/methods/test_astype.py @@ -23,6 +23,7 @@ option_context, ) import pandas._testing as tm +from pandas.core.arrays.integer import coerce_to_array def _check_cast(df, v): @@ -632,13 +633,9 @@ def test_astype_tz_object_conversion(self, tz): result = result.astype({"tz": "datetime64[ns, Europe/London]"}) tm.assert_frame_equal(result, expected) - def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request): + def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture): + # GH#41409 tz = tz_naive_fixture - if tz is None: - mark = pytest.mark.xfail( - reason="GH#36153 uses ndarray formatting instead of DTA formatting" - ) - request.node.add_marker(mark) dti = date_range("2016-01-01", periods=3, tz=tz) dta = dti._data @@ -660,11 +657,40 @@ def test_astype_dt64_to_string(self, frame_or_series, tz_naive_fixture, request) alt = obj.astype(str) assert np.all(alt.iloc[1:] == result.iloc[1:]) + def test_astype_td64_to_string(self, frame_or_series): + # GH#41409 + tdi = pd.timedelta_range("1 Day", periods=3) + obj = frame_or_series(tdi) + + expected = frame_or_series(["1 days", "2 days", "3 days"], dtype="string") + result = obj.astype("string") + tm.assert_equal(result, expected) + def test_astype_bytes(self): # GH#39474 result = DataFrame(["foo", "bar", "baz"]).astype(bytes) assert result.dtypes[0] == np.dtype("S3") + @pytest.mark.parametrize( + "index_slice", + [ + np.s_[:2, :2], + np.s_[:1, :2], + np.s_[:2, :1], + np.s_[::2, ::2], + np.s_[::1, ::2], + np.s_[::2, ::1], + ], + ) + def test_astype_noncontiguous(self, index_slice): + # GH#42396 + data = np.arange(16).reshape(4, 4) + df = DataFrame(data) + + result = df.iloc[index_slice].astype("int16") + expected = df.iloc[index_slice] + tm.assert_frame_equal(result, expected, check_dtype=False) + class TestAstypeCategorical: def test_astype_from_categorical3(self): @@ -693,3 +719,40 @@ def test_categorical_astype_to_int(self, any_int_or_nullable_int_dtype): {"col1": pd.array([2, 1, 3], dtype=any_int_or_nullable_int_dtype)} ) tm.assert_frame_equal(df, expected) + + def test_astype_categorical_to_string_missing(self): + # https://github.com/pandas-dev/pandas/issues/41797 + df = DataFrame(["a", "b", np.nan]) + expected = df.astype(str) + cat = df.astype("category") + result = cat.astype(str) + tm.assert_frame_equal(result, expected) + + +class IntegerArrayNoCopy(pd.core.arrays.IntegerArray): + # GH 42501 + + @classmethod + def _from_sequence(cls, scalars, *, dtype=None, copy=False): + values, mask = coerce_to_array(scalars, dtype=dtype, copy=copy) + return IntegerArrayNoCopy(values, mask) + + def copy(self): + assert False + + +class Int16DtypeNoCopy(pd.Int16Dtype): + # GH 42501 + + @classmethod + def construct_array_type(cls): + return IntegerArrayNoCopy + + +def test_frame_astype_no_copy(): + # GH 42501 + df = DataFrame({"a": [1, 4, None, 5], "b": [6, 7, 8, 9]}, dtype=object) + result = df.astype({"a": Int16DtypeNoCopy()}, copy=False) + + assert result.a.dtype == pd.Int16Dtype() + assert np.shares_memory(df.b.values, result.b.values) diff --git a/pandas/tests/frame/methods/test_copy.py 
b/pandas/tests/frame/methods/test_copy.py index be52cf55fccb2..1c0b0755e7d94 100644 --- a/pandas/tests/frame/methods/test_copy.py +++ b/pandas/tests/frame/methods/test_copy.py @@ -1,5 +1,8 @@ +import numpy as np import pytest +import pandas.util._test_decorators as td + from pandas import DataFrame import pandas._testing as tm @@ -41,3 +44,20 @@ def test_copy(self, float_frame, float_string_frame): # copy objects copy = float_string_frame.copy() assert copy._mgr is not float_string_frame._mgr + + @td.skip_array_manager_invalid_test + def test_copy_consolidates(self): + # GH#42477 + df = DataFrame( + { + "a": np.random.randint(0, 100, size=55), + "b": np.random.randint(0, 100, size=55), + } + ) + + for i in range(0, 10): + df.loc[:, f"n_{i}"] = np.random.randint(0, 100, size=55) + + assert len(df._mgr.blocks) == 11 + result = df.copy() + assert len(result._mgr.blocks) == 1 diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py index 352d95156bf98..c259902ec2498 100644 --- a/pandas/tests/frame/methods/test_cov_corr.py +++ b/pandas/tests/frame/methods/test_cov_corr.py @@ -92,6 +92,7 @@ class TestDataFrameCorr: def test_corr_scipy_method(self, float_frame, method): float_frame["A"][:5] = np.nan float_frame["B"][5:10] = np.nan + float_frame["A"][:10] = float_frame["A"][10:20] correls = float_frame.corr(method=method) expected = float_frame["A"].corr(float_frame["C"], method=method) diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py index b3eeab9db4ad5..87139b3a1f8b9 100644 --- a/pandas/tests/frame/methods/test_drop.py +++ b/pandas/tests/frame/methods/test_drop.py @@ -456,6 +456,17 @@ def test_drop_with_non_unique_multiindex(self): expected = DataFrame([2], index=MultiIndex.from_arrays([["y"], ["j"]])) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("indexer", [("a", "a"), [("a", "a")]]) + def test_drop_tuple_with_non_unique_multiindex(self, indexer): + # GH#42771 + idx = MultiIndex.from_product([["a", "b"], ["a", "a"]]) + df = DataFrame({"x": range(len(idx))}, index=idx) + result = df.drop(index=[("a", "a")]) + expected = DataFrame( + {"x": [2, 3]}, index=MultiIndex.from_tuples([("b", "a"), ("b", "a")]) + ) + tm.assert_frame_equal(result, expected) + def test_drop_with_duplicate_columns(self): df = DataFrame( [[1, 5, 7.0], [1, 5, 7.0], [1, 5, 7.0]], columns=["bar", "a", "a"] diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py index bd0901387eeed..6fdf5d806ac6b 100644 --- a/pandas/tests/frame/methods/test_explode.py +++ b/pandas/tests/frame/methods/test_explode.py @@ -9,7 +9,12 @@ def test_error(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} ) - with pytest.raises(ValueError, match="column must be a scalar"): + with pytest.raises( + ValueError, match="column must be a scalar, tuple, or list thereof" + ): + df.explode([list("AA")]) + + with pytest.raises(ValueError, match="column must be unique"): df.explode(list("AA")) df.columns = list("AA") @@ -17,6 +22,37 @@ def test_error(): df.explode("A") +@pytest.mark.parametrize( + "input_subset, error_message", + [ + ( + list("AC"), + "columns must have matching element counts", + ), + ( + [], + "column must be nonempty", + ), + ( + list("AC"), + "columns must have matching element counts", + ), + ], +) +def test_error_multi_columns(input_subset, error_message): + # GH 39240 + df = pd.DataFrame( + { + "A": [[0, 1, 2], np.nan, [], (3, 4)], + 
"B": 1, + "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]], + }, + index=list("abcd"), + ) + with pytest.raises(ValueError, match=error_message): + df.explode(input_subset) + + def test_basic(): df = pd.DataFrame( {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1} @@ -180,3 +216,58 @@ def test_explode_sets(): result = df.explode(column="a").sort_values(by="a") expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1]) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize( + "input_subset, expected_dict, expected_index", + [ + ( + list("AC"), + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], + index=list("aaabcdde"), + dtype=object, + ), + "B": 1, + "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan], + }, + list("aaabcdde"), + ), + ( + list("A"), + { + "A": pd.Series( + [0, 1, 2, np.nan, np.nan, 3, 4, np.nan], + index=list("aaabcdde"), + dtype=object, + ), + "B": 1, + "C": [ + ["a", "b", "c"], + ["a", "b", "c"], + ["a", "b", "c"], + "foo", + [], + ["d", "e"], + ["d", "e"], + np.nan, + ], + }, + list("aaabcdde"), + ), + ], +) +def test_multi_columns(input_subset, expected_dict, expected_index): + # GH 39240 + df = pd.DataFrame( + { + "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan], + "B": 1, + "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan], + }, + index=list("abcde"), + ) + result = df.explode(input_subset) + expected = pd.DataFrame(expected_dict, expected_index) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/frame/methods/test_sample.py b/pandas/tests/frame/methods/test_sample.py index 55ef665c55241..604788ba91633 100644 --- a/pandas/tests/frame/methods/test_sample.py +++ b/pandas/tests/frame/methods/test_sample.py @@ -5,6 +5,7 @@ from pandas import ( DataFrame, + Index, Series, ) import pandas._testing as tm @@ -326,3 +327,12 @@ def test_sample_is_copy(self): with tm.assert_produces_warning(None): df2["d"] = 1 + + def test_sample_ignore_index(self): + # GH 38581 + df = DataFrame( + {"col1": range(10, 20), "col2": range(20, 30), "colString": ["a"] * 10} + ) + result = df.sample(3, ignore_index=True) + expected_index = Index([0, 1, 2]) + tm.assert_index_equal(result.index, expected_index) diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index 0474206aec06f..9df5f79aa7d19 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -183,6 +183,32 @@ def test_shift_axis1_multiple_blocks(self, using_array_manager): tm.assert_frame_equal(result, expected) + @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) axis=1 support + def test_shift_axis1_multiple_blocks_with_int_fill(self): + # GH#42719 + df1 = DataFrame(np.random.randint(1000, size=(5, 3))) + df2 = DataFrame(np.random.randint(1000, size=(5, 2))) + df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) + result = df3.shift(2, axis=1, fill_value=np.int_(0)) + assert len(df3._mgr.blocks) == 2 + + expected = df3.take([-1, -1, 0, 1], axis=1) + expected.iloc[:, :2] = np.int_(0) + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + + # Case with periods < 0 + df3 = pd.concat([df1.iloc[:4, 1:3], df2.iloc[:4, :]], axis=1) + result = df3.shift(-2, axis=1, fill_value=np.int_(0)) + assert len(df3._mgr.blocks) == 2 + + expected = df3.take([2, 3, -1, -1], axis=1) + expected.iloc[:, -2:] = np.int_(0) + expected.columns = df3.columns + + tm.assert_frame_equal(result, expected) + @pytest.mark.filterwarnings("ignore:tshift 
is deprecated:FutureWarning") def test_tshift(self, datetime_frame): # TODO: remove this test when tshift deprecation is enforced diff --git a/pandas/tests/frame/methods/test_to_records.py b/pandas/tests/frame/methods/test_to_records.py index ba8fe25401e8c..2c96cf291c154 100644 --- a/pandas/tests/frame/methods/test_to_records.py +++ b/pandas/tests/frame/methods/test_to_records.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas.compat import is_numpy_dev - from pandas import ( CategoricalDtype, DataFrame, @@ -173,28 +171,20 @@ def test_to_records_with_categorical(self): ), ), # Pass in a type instance. - pytest.param( + ( {"column_dtypes": str}, np.rec.array( [("0", "1", "0.2", "a"), ("1", "2", "1.5", "bc")], dtype=[("index", "{{ table_title|default("My Table") }} - {{ super() }} - {% endblock table %}""" + {% extends "html_table.tpl" %} + {% block table %} +

<h1>{{custom_title}}</h1>

+ {{ super() }} + {% endblock table %}""" ) ) - result = Styler.from_custom_template(str(tmpdir.join("templates")), "myhtml.tpl") + result = Styler.from_custom_template(str(tmpdir.join("tpl")), "myhtml_table.tpl") assert issubclass(result, Styler) assert result.env is not Styler.env - assert result.template_html is not Styler.template_html + assert result.template_html_table is not Styler.template_html_table styler = result(DataFrame({"A": [1, 2]})) - assert styler.render() + assert "

<h1>My Title</h1>

\n\n\n + {{ super() }} + {% endblock style %}""" + ) + ) + result = Styler.from_custom_template( + str(tmpdir.join("tpl")), html_style="myhtml_style.tpl" + ) + assert issubclass(result, Styler) + assert result.env is not Styler.env + assert result.template_html_style is not Styler.template_html_style + styler = result(DataFrame({"A": [1, 2]})) + assert '\n\nfull cap" in styler.render() + + +@pytest.mark.parametrize("index", [False, True]) +@pytest.mark.parametrize("columns", [False, True]) +@pytest.mark.parametrize("index_name", [True, False]) +def test_sticky_basic(styler, index, columns, index_name): + if index_name: + styler.index.name = "some text" + if index: + styler.set_sticky(axis=0) + if columns: + styler.set_sticky(axis=1) + + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " left: 0px;\n z-index: {1};\n}}" + ) + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " top: {1}px;\n z-index: {2};\n{3}}}" + ) + + res = styler.set_uuid("").to_html() + + # test index stickys over thead and tbody + assert (left_css.format("thead tr th:nth-child(1)", "3 !important") in res) is index + assert (left_css.format("tbody tr th:nth-child(1)", "1") in res) is index + + # test column stickys including if name row + assert ( + top_css.format("thead tr:nth-child(1) th", "0", "2", " height: 25px;\n") in res + ) is (columns and index_name) + assert ( + top_css.format("thead tr:nth-child(2) th", "25", "2", " height: 25px;\n") + in res + ) is (columns and index_name) + assert (top_css.format("thead tr:nth-child(1) th", "0", "2", "") in res) is ( + columns and not index_name + ) + + +@pytest.mark.parametrize("index", [False, True]) +@pytest.mark.parametrize("columns", [False, True]) +def test_sticky_mi(styler_mi, index, columns): + if index: + styler_mi.set_sticky(axis=0) + if columns: + styler_mi.set_sticky(axis=1) + + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " left: {1}px;\n min-width: 75px;\n max-width: 75px;\n z-index: {2};\n}}" + ) + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " top: {1}px;\n height: 25px;\n z-index: {2};\n}}" + ) + + res = styler_mi.set_uuid("").to_html() + + # test the index stickys for thead and tbody over both levels + assert ( + left_css.format("thead tr th:nth-child(1)", "0", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level0", "0", "1") in res) is index + assert ( + left_css.format("thead tr th:nth-child(2)", "75", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level1", "75", "1") in res) is index + + # test the column stickys for each level row + assert (top_css.format("thead tr:nth-child(1) th", "0", "2") in res) is columns + assert (top_css.format("thead tr:nth-child(2) th", "25", "2") in res) is columns + + +@pytest.mark.parametrize("index", [False, True]) +@pytest.mark.parametrize("columns", [False, True]) +def test_sticky_levels(styler_mi, index, columns): + if index: + styler_mi.set_sticky(axis=0, levels=[1]) + if columns: + styler_mi.set_sticky(axis=1, levels=[1]) + + left_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " left: {1}px;\n min-width: 75px;\n max-width: 75px;\n z-index: {2};\n}}" + ) + top_css = ( + "#T_ {0} {{\n position: sticky;\n background-color: white;\n" + " top: {1}px;\n height: 25px;\n z-index: {2};\n}}" + ) + + res = styler_mi.set_uuid("").to_html() + + # test no sticking of level0 + assert "#T_ thead tr th:nth-child(1)" not in res + 
assert "#T_ tbody tr th.level0" not in res + assert "#T_ thead tr:nth-child(1) th" not in res + + # test sticking level1 + assert ( + left_css.format("thead tr th:nth-child(2)", "0", "3 !important") in res + ) is index + assert (left_css.format("tbody tr th.level1", "0", "1") in res) is index + assert (top_css.format("thead tr:nth-child(2) th", "0", "2") in res) is columns + + +def test_sticky_raises(styler): + with pytest.raises(ValueError, match="`axis` must be"): + styler.set_sticky(axis="bad") diff --git a/pandas/tests/io/formats/style/test_style.py b/pandas/tests/io/formats/style/test_style.py index 281170ab6c7cb..99c725ed8df69 100644 --- a/pandas/tests/io/formats/style/test_style.py +++ b/pandas/tests/io/formats/style/test_style.py @@ -38,6 +38,35 @@ def mi_styler(mi_df): return Styler(mi_df, uuid_len=0) +@pytest.fixture +def mi_styler_comp(mi_styler): + # comprehensively add features to mi_styler + mi_styler.uuid_len = 5 + mi_styler.uuid = "abcde_" + mi_styler.set_caption("capt") + mi_styler.set_table_styles([{"selector": "a", "props": "a:v;"}]) + mi_styler.hide_columns() + mi_styler.hide_columns([("c0", "c1_a")]) + mi_styler.hide_index() + mi_styler.hide_index([("i0", "i1_a")]) + mi_styler.set_table_attributes('class="box"') + mi_styler.format(na_rep="MISSING", precision=3) + mi_styler.highlight_max(axis=None) + mi_styler.set_td_classes( + DataFrame( + [["a", "b"], ["a", "c"]], index=mi_styler.index, columns=mi_styler.columns + ) + ) + mi_styler.set_tooltips( + DataFrame( + [["a2", "b2"], ["a2", "c2"]], + index=mi_styler.index, + columns=mi_styler.columns, + ) + ) + return mi_styler + + @pytest.mark.parametrize( "sparse_columns, exp_cols", [ @@ -156,6 +185,49 @@ def test_render_trimming_mi(): assert {"attributes": 'colspan="2"'}.items() <= ctx["head"][0][2].items() +@pytest.mark.parametrize("comprehensive", [True, False]) +@pytest.mark.parametrize("render", [True, False]) +@pytest.mark.parametrize("deepcopy", [True, False]) +def test_copy(comprehensive, render, deepcopy, mi_styler, mi_styler_comp): + styler = mi_styler_comp if comprehensive else mi_styler + styler.uuid_len = 5 + + s2 = copy.deepcopy(styler) if deepcopy else copy.copy(styler) # make copy and check + assert s2 is not styler + + if render: + styler.to_html() + + excl = ["na_rep", "precision", "uuid", "cellstyle_map"] # deprecated or special var + if not deepcopy: # check memory locations are equal for all included attributes + for attr in [a for a in styler.__dict__ if (not callable(a) and a not in excl)]: + assert id(getattr(s2, attr)) == id(getattr(styler, attr)) + else: # check memory locations are different for nested or mutable vars + shallow = [ + "data", + "columns", + "index", + "uuid_len", + "caption", + "cell_ids", + "hide_index_", + "hide_columns_", + "table_attributes", + ] + for attr in shallow: + assert id(getattr(s2, attr)) == id(getattr(styler, attr)) + + for attr in [ + a + for a in styler.__dict__ + if (not callable(a) and a not in excl and a not in shallow) + ]: + if getattr(s2, attr) is None: + assert id(getattr(s2, attr)) == id(getattr(styler, attr)) + else: + assert id(getattr(s2, attr)) != id(getattr(styler, attr)) + + class TestStyler: def setup_method(self, method): np.random.seed(24) @@ -211,102 +283,6 @@ def test_update_ctx_flatten_multi_and_trailing_semi(self): } assert self.styler.ctx == expected - @pytest.mark.parametrize("do_changes", [True, False]) - @pytest.mark.parametrize("do_render", [True, False]) - def test_copy(self, do_changes, do_render): - # Updated in GH39708 - # Change some 
defaults (to check later if the new values are copied) - if do_changes: - self.styler.set_table_styles( - [{"selector": "th", "props": [("foo", "bar")]}] - ) - self.styler.set_table_attributes('class="foo" data-bar') - self.styler.hidden_index = not self.styler.hidden_index - self.styler.hide_columns("A") - classes = DataFrame( - [["favorite-val red", ""], [None, "blue my-val"]], - index=self.df.index, - columns=self.df.columns, - ) - self.styler.set_td_classes(classes) - ttips = DataFrame( - data=[["Favorite", ""], [np.nan, "my"]], - columns=self.df.columns, - index=self.df.index, - ) - self.styler.set_tooltips(ttips) - self.styler.cell_ids = not self.styler.cell_ids - - if do_render: - self.styler.render() - - s_copy = copy.copy(self.styler) - s_deepcopy = copy.deepcopy(self.styler) - - assert self.styler is not s_copy - assert self.styler is not s_deepcopy - - # Check for identity - assert self.styler.ctx is s_copy.ctx - assert self.styler._todo is s_copy._todo - assert self.styler.table_styles is s_copy.table_styles - assert self.styler.hidden_columns is s_copy.hidden_columns - assert self.styler.cell_context is s_copy.cell_context - assert self.styler.tooltips is s_copy.tooltips - if do_changes: # self.styler.tooltips is not None - assert self.styler.tooltips.tt_data is s_copy.tooltips.tt_data - assert ( - self.styler.tooltips.class_properties - is s_copy.tooltips.class_properties - ) - assert self.styler.tooltips.table_styles is s_copy.tooltips.table_styles - - # Check for non-identity - assert self.styler.ctx is not s_deepcopy.ctx - assert self.styler._todo is not s_deepcopy._todo - assert self.styler.hidden_columns is not s_deepcopy.hidden_columns - assert self.styler.cell_context is not s_deepcopy.cell_context - if do_changes: # self.styler.table_style is not None - assert self.styler.table_styles is not s_deepcopy.table_styles - if do_changes: # self.styler.tooltips is not None - assert self.styler.tooltips is not s_deepcopy.tooltips - assert self.styler.tooltips.tt_data is not s_deepcopy.tooltips.tt_data - assert ( - self.styler.tooltips.class_properties - is not s_deepcopy.tooltips.class_properties - ) - assert ( - self.styler.tooltips.table_styles - is not s_deepcopy.tooltips.table_styles - ) - - self.styler._update_ctx(self.attrs) - self.styler.highlight_max() - assert self.styler.ctx == s_copy.ctx - assert self.styler.ctx != s_deepcopy.ctx - assert self.styler._todo == s_copy._todo - assert self.styler._todo != s_deepcopy._todo - assert s_deepcopy._todo == [] - - equal_attributes = [ - "table_styles", - "table_attributes", - "cell_ids", - "hidden_index", - "hidden_columns", - "cell_context", - ] - for s2 in [s_copy, s_deepcopy]: - for att in equal_attributes: - assert self.styler.__dict__[att] == s2.__dict__[att] - if do_changes: # self.styler.tooltips is not None - tm.assert_frame_equal(self.styler.tooltips.tt_data, s2.tooltips.tt_data) - assert ( - self.styler.tooltips.class_properties - == s2.tooltips.class_properties - ) - assert self.styler.tooltips.table_styles == s2.tooltips.table_styles - def test_clear(self): # updated in GH 39396 tt = DataFrame({"A": [None, "tt"]}) @@ -317,7 +293,7 @@ def test_clear(self): assert len(s._todo) > 0 assert s.tooltips assert len(s.cell_context) > 0 - assert s.hidden_index is True + assert s.hide_index_ is True assert len(s.hidden_columns) > 0 s = s._compute() @@ -331,7 +307,7 @@ def test_clear(self): assert len(s._todo) == 0 assert not s.tooltips assert len(s.cell_context) == 0 - assert s.hidden_index is False + assert s.hide_index_ is 
False assert len(s.hidden_columns) == 0 def test_render(self): @@ -1127,6 +1103,14 @@ def test_mi_sparse_column_names(self): ] assert head == expected + def test_hide_column_headers(self): + ctx = self.styler.hide_columns()._translate(True, True) + assert len(ctx["head"]) == 0 # no header entries with an unnamed index + + self.df.index.name = "some_name" + ctx = self.df.style.hide_columns()._translate(True, True) + assert len(ctx["head"]) == 0 # no header for index names, changed in #42101 + def test_hide_single_index(self): # GH 14194 # single unnamed index @@ -1195,7 +1179,7 @@ def test_hide_columns_single_level(self): assert not ctx["body"][0][1]["is_visible"] # col A, row 1 assert not ctx["body"][1][2]["is_visible"] # col B, row 1 - def test_hide_columns_mult_levels(self): + def test_hide_columns_index_mult_levels(self): # GH 14194 # setup dataframe with multiple column levels and indices i1 = MultiIndex.from_arrays( @@ -1227,7 +1211,8 @@ def test_hide_columns_mult_levels(self): # hide first column only ctx = df.style.hide_columns([("b", 0)])._translate(True, True) - assert ctx["head"][0][2]["is_visible"] # b + assert not ctx["head"][0][2]["is_visible"] # b + assert ctx["head"][0][3]["is_visible"] # b assert not ctx["head"][1][2]["is_visible"] # 0 assert not ctx["body"][1][2]["is_visible"] # 3 assert ctx["body"][1][3]["is_visible"] @@ -1243,6 +1228,18 @@ def test_hide_columns_mult_levels(self): assert ctx["body"][1][2]["is_visible"] assert ctx["body"][1][2]["display_value"] == 3 + # hide top row level, which hides both rows + ctx = df.style.hide_index("a")._translate(True, True) + for i in [0, 1, 2, 3]: + assert not ctx["body"][0][i]["is_visible"] + assert not ctx["body"][1][i]["is_visible"] + + # hide first row only + ctx = df.style.hide_index(("a", 0))._translate(True, True) + for i in [0, 1, 2, 3]: + assert not ctx["body"][0][i]["is_visible"] + assert ctx["body"][1][i]["is_visible"] + def test_pipe(self): def set_caption_from_template(styler, a, b): return styler.set_caption(f"Dataframe with a = {a} and b = {b}") diff --git a/pandas/tests/io/formats/style/test_to_latex.py b/pandas/tests/io/formats/style/test_to_latex.py index 97347bddaa187..55b17dc37adda 100644 --- a/pandas/tests/io/formats/style/test_to_latex.py +++ b/pandas/tests/io/formats/style/test_to_latex.py @@ -12,6 +12,7 @@ from pandas.io.formats.style import Styler from pandas.io.formats.style_render import ( _parse_latex_cell_styles, + _parse_latex_css_conversion, _parse_latex_header_span, _parse_latex_table_styles, _parse_latex_table_wrapping, @@ -443,3 +444,64 @@ def test_parse_latex_table_wrapping(styler): def test_short_caption(styler): result = styler.to_latex(caption=("full cap", "short cap")) assert "\\caption[short cap]{full cap}" in result + + +@pytest.mark.parametrize( + "css, expected", + [ + ([("color", "red")], [("color", "{red}")]), # test color and input format types + ( + [("color", "rgb(128, 128, 128 )")], + [("color", "[rgb]{0.502, 0.502, 0.502}")], + ), + ( + [("color", "rgb(128, 50%, 25% )")], + [("color", "[rgb]{0.502, 0.500, 0.250}")], + ), + ( + [("color", "rgba(128,128,128,1)")], + [("color", "[rgb]{0.502, 0.502, 0.502}")], + ), + ([("color", "#FF00FF")], [("color", "[HTML]{FF00FF}")]), + ([("color", "#F0F")], [("color", "[HTML]{FF00FF}")]), + ([("font-weight", "bold")], [("bfseries", "")]), # test font-weight and types + ([("font-weight", "bolder")], [("bfseries", "")]), + ([("font-weight", "normal")], []), + ([("background-color", "red")], [("cellcolor", "{red}--lwrap")]), + ( + 
[("background-color", "#FF00FF")], # test background-color command and wrap + [("cellcolor", "[HTML]{FF00FF}--lwrap")], + ), + ([("font-style", "italic")], [("itshape", "")]), # test font-style and types + ([("font-style", "oblique")], [("slshape", "")]), + ([("font-style", "normal")], []), + ([("color", "red /*--dwrap*/")], [("color", "{red}--dwrap")]), # css comments + ([("background-color", "red /* --dwrap */")], [("cellcolor", "{red}--dwrap")]), + ], +) +def test_parse_latex_css_conversion(css, expected): + result = _parse_latex_css_conversion(css) + assert result == expected + + +def test_parse_latex_css_conversion_option(): + css = [("command", "option--latex--wrap")] + expected = [("command", "option--wrap")] + result = _parse_latex_css_conversion(css) + assert result == expected + + +def test_styler_object_after_render(styler): + # GH 42320 + pre_render = styler._copy(deepcopy=True) + styler.to_latex( + column_format="rllr", + position="h", + position_float="centering", + hrules=True, + label="my lab", + caption="my cap", + ) + + assert pre_render.table_styles == styler.table_styles + assert pre_render.caption == styler.caption diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0ffc6044a5897..2d418fcbcc395 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -28,8 +28,6 @@ ) import pandas._testing as tm -pytestmark = pytest.mark.skipif(PY310, reason="timeout with coverage") - _seriesd = tm.getSeriesData() _frame = DataFrame(_seriesd) @@ -1181,6 +1179,7 @@ def test_sparse(self): expected = s.to_json() assert expected == ss.to_json() + @pytest.mark.skipif(PY310, reason="segfault GH 42130") @pytest.mark.parametrize( "ts", [ @@ -1198,6 +1197,7 @@ def test_tz_is_utc(self, ts): dt = ts.to_pydatetime() assert dumps(dt, iso_dates=True) == exp + @pytest.mark.skipif(PY310, reason="segfault GH 42130") @pytest.mark.parametrize( "tz_range", [ diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py index 805f6b8dbe461..57a6b214cec84 100644 --- a/pandas/tests/io/json/test_ujson.py +++ b/pandas/tests/io/json/test_ujson.py @@ -16,6 +16,7 @@ import pandas._libs.json as ujson from pandas.compat import ( IS64, + PY310, is_platform_windows, ) @@ -248,7 +249,21 @@ def test_double_precision(self): assert rounded_input == json.loads(output) assert rounded_input == ujson.decode(output) - @pytest.mark.parametrize("invalid_val", [20, -1, "9", None]) + @pytest.mark.parametrize( + "invalid_val", + [ + 20, + -1, + pytest.param( + "9", + marks=pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940"), + ), + pytest.param( + None, + marks=pytest.mark.xfail(PY310, reason="Failing on Python 3.10 GH41940"), + ), + ], + ) def test_invalid_double_precision(self, invalid_val): double_input = 30.12345678901234567890 expected_exception = ValueError if isinstance(invalid_val, int) else TypeError diff --git a/pandas/tests/io/parser/common/__init__.py b/pandas/tests/io/parser/common/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/io/parser/common/test_chunksize.py b/pandas/tests/io/parser/common/test_chunksize.py index ceb770ce72b78..86891367e9bd6 100644 --- a/pandas/tests/io/parser/common/test_chunksize.py +++ b/pandas/tests/io/parser/common/test_chunksize.py @@ -143,10 +143,7 @@ def test_read_chunksize_jagged_names(all_parsers): parser = all_parsers data = "\n".join(["0"] * 7 + [",".join(["0"] * 10)]) - # error: List item 0 has incompatible type "float"; 
expected "int" - expected = DataFrame( - [[0] + [np.nan] * 9] * 7 + [[0] * 10] # type: ignore[list-item] - ) + expected = DataFrame([[0] + [np.nan] * 9] * 7 + [[0] * 10]) with parser.read_csv(StringIO(data), names=range(10), chunksize=4) as reader: result = concat(reader) tm.assert_frame_equal(result, expected) @@ -180,13 +177,17 @@ def test_chunks_have_consistent_numerical_type(all_parsers): def test_warn_if_chunks_have_mismatched_type(all_parsers, request): warning_type = None parser = all_parsers - integers = [str(i) for i in range(499999)] - data = "a\n" + "\n".join(integers + ["a", "b"] + integers) + size = 10000 # see gh-3866: if chunks are different types and can't # be coerced using numerical types, then issue warning. if parser.engine == "c" and parser.low_memory: warning_type = DtypeWarning + # Use larger size to hit warning path + size = 499999 + + integers = [str(i) for i in range(size)] + data = "a\n" + "\n".join(integers + ["a", "b"] + integers) buf = StringIO(data) diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 8fa2d7f7b8d65..b2e528aa5f8d5 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas.errors import ( EmptyDataError, ParserError, + ParserWarning, ) from pandas import ( @@ -685,7 +686,8 @@ def test_no_header_two_extra_columns(all_parsers): ref = DataFrame([["foo", "bar", "baz"]], columns=column_names) stream = StringIO("foo,bar,baz,bam,blah") parser = all_parsers - df = parser.read_csv(stream, header=None, names=column_names, index_col=False) + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(stream, header=None, names=column_names, index_col=False) tm.assert_frame_equal(df, ref) @@ -762,15 +764,24 @@ def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter): @pytest.mark.parametrize("func", ["read_csv", "read_table"]) -@pytest.mark.parametrize("prefix", [None, "x"]) -@pytest.mark.parametrize("names", [None, ["a"]]) -def test_names_and_prefix_not_lib_no_default(all_parsers, names, prefix, func): +def test_names_and_prefix_not_None_raises(all_parsers, func): # GH#39123 f = StringIO("a,b\n1,2") parser = all_parsers msg = "Specified named and prefix; you can only specify one." 
with pytest.raises(ValueError, match=msg): - getattr(parser, func)(f, names=names, prefix=prefix) + getattr(parser, func)(f, names=["a", "b"], prefix="x") + + +@pytest.mark.parametrize("func", ["read_csv", "read_table"]) +@pytest.mark.parametrize("prefix, names", [(None, ["x0", "x1"]), ("x", None)]) +def test_names_and_prefix_explicit_None(all_parsers, names, prefix, func): + # GH42387 + f = StringIO("a,b\n1,2") + expected = DataFrame({"x0": ["a", "1"], "x1": ["b", "2"]}) + parser = all_parsers + result = getattr(parser, func)(f, names=names, sep=",", prefix=prefix, header=None) + tm.assert_frame_equal(result, expected) def test_dict_keys_as_names(all_parsers): diff --git a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py index 59fd3de60e0bf..0b59e877cd9a1 100644 --- a/pandas/tests/io/parser/dtypes/test_dtypes_basic.py +++ b/pandas/tests/io/parser/dtypes/test_dtypes_basic.py @@ -245,6 +245,19 @@ def test_dtype_mangle_dup_cols(all_parsers, dtypes, exp_value): # GH#35211 parser = all_parsers data = """a,a\n1,1""" - result = parser.read_csv(StringIO(data), dtype={"a": str, **dtypes}) + dtype_dict = {"a": str, **dtypes} + # GH#42462 + dtype_dict_copy = dtype_dict.copy() + result = parser.read_csv(StringIO(data), dtype=dtype_dict) expected = DataFrame({"a": ["1"], "a.1": [exp_value]}) + assert dtype_dict == dtype_dict_copy, "dtype dict changed" + tm.assert_frame_equal(result, expected) + + +def test_dtype_mangle_dup_cols_single_dtype(all_parsers): + # GH#42022 + parser = all_parsers + data = """a,a\n1,1""" + result = parser.read_csv(StringIO(data), dtype=str) + expected = DataFrame({"a": ["1"], "a.1": ["1"]}) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py index 006438df2a5e0..9cf2f068eb318 100644 --- a/pandas/tests/io/parser/test_encoding.py +++ b/pandas/tests/io/parser/test_encoding.py @@ -237,3 +237,16 @@ def test_encoding_memory_map(all_parsers, encoding): expected.to_csv(file, index=False, encoding=encoding) df = parser.read_csv(file, encoding=encoding, memory_map=True) tm.assert_frame_equal(df, expected) + + +def test_not_readable(all_parsers): + # GH43439 + parser = all_parsers + if parser.engine in ("python", "pyarrow"): + pytest.skip("SpooledTemporaryFile does only work with the c-engine") + with tempfile.SpooledTemporaryFile() as handle: + handle.write(b"abcd") + handle.seek(0) + df = parser.read_csv(handle) + expected = DataFrame([], columns=["abcd"]) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/parser/usecols/test_usecols_basic.py b/pandas/tests/io/parser/usecols/test_usecols_basic.py index b86dc5ef85fc6..16649be5b8a58 100644 --- a/pandas/tests/io/parser/usecols/test_usecols_basic.py +++ b/pandas/tests/io/parser/usecols/test_usecols_basic.py @@ -383,7 +383,9 @@ def test_usecols_indices_out_of_bounds(all_parsers, names): a,b 1,2 """ - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, raise_on_extra_warnings=False + ): result = parser.read_csv(StringIO(data), usecols=[0, 2], names=names, header=0) expected = DataFrame({"a": [1], "b": [None]}) if names is None and parser.engine == "python": diff --git a/pandas/tests/io/pytables/test_append.py b/pandas/tests/io/pytables/test_append.py index 719b54a57a6c7..b5f9e6e74ece9 100644 --- a/pandas/tests/io/pytables/test_append.py +++ b/pandas/tests/io/pytables/test_append.py @@ -778,6 
+778,22 @@ def test_append_raise(setup_path): with pytest.raises(ValueError, match=msg): store.append("df", df) + # incompatible type (GH 41897) + _maybe_remove(store, "df") + df["foo"] = Timestamp("20130101") + store.append("df", df) + df["foo"] = "bar" + msg = re.escape( + "invalid combination of [values_axes] on appending data " + "[name->values_block_1,cname->values_block_1," + "dtype->bytes24,kind->string,shape->(1, 30)] " + "vs current table " + "[name->values_block_1,cname->values_block_1," + "dtype->datetime64,kind->datetime64,shape->None]" + ) + with pytest.raises(ValueError, match=msg): + store.append("df", df) + def test_append_with_timedelta(setup_path): # GH 3577 diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d100c584b698a..914f68b90ad37 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa): msg = r"parquet must have string column names" self.check_error_on_write(df, engine, ValueError, msg) + def test_use_nullable_dtypes(self, engine): + import pyarrow.parquet as pq + + if engine == "fastparquet": + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + pytest.skip("Fastparquet nullable dtype support is disabled") + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + # Test that nullable dtypes used even in absence of nulls + "e": pyarrow.array([1, 2, 3, 4], "int64"), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path, engine=engine) + result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + "e": pd.array([1, 2, 3, 4], dtype="Int64"), + } + ) + if engine == "fastparquet": + # Fastparquet doesn't support string columns yet + # Only int and boolean + result2 = result2.drop("c", axis=1) + expected = expected.drop("c", axis=1) + tm.assert_frame_equal(result2, expected) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): @@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow") - def test_use_nullable_dtypes(self, pa): - import pyarrow.parquet as pq - - table = pyarrow.table( - { - "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array([1, 2, 3, None], "uint8"), - "c": pyarrow.array(["a", "b", "c", None]), - "d": pyarrow.array([True, False, True, None]), - } - ) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path) - result2 = read_parquet(path, use_nullable_dtypes=True) - - assert result1["a"].dtype == np.dtype("float64") - expected = pd.DataFrame( - { - "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array([1, 2, 3, None], dtype="UInt8"), - "c": pd.array(["a", "b", "c", None], dtype="string"), - "d": pd.array([True, False, True, None], dtype="boolean"), - } - ) - tm.assert_frame_equal(result2, 
expected) - def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error @@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") - check_round_trip(df, fp, expected=expected) + # Fastparquet bug in 0.7.1 makes it so that this dtype becomes + # float64 + check_round_trip(df, fp, expected=expected, check_dtype=False) def test_unsupported(self, fp): @@ -1049,7 +1063,7 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected.index.name = "index" check_round_trip(df, fp, expected=expected) - def test_use_nullable_dtypes_not_supported(self, fp): + def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp): df = pd.DataFrame({"a": [1, 2]}) with tm.ensure_clean() as path: diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 7cf9d7e9a1925..c4423e6cc4ead 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -163,9 +163,9 @@ def compare_index_period(result, expected, typ, version): tm.assert_index_equal(result.shift(2), expected.shift(2)) -files = glob.glob( - os.path.join(os.path.dirname(__file__), "data", "legacy_pickle", "*", "*.pickle") -) +here = os.path.dirname(__file__) +legacy_dirname = os.path.join(here, "data", "legacy_pickle") +files = glob.glob(os.path.join(legacy_dirname, "*", "*.pickle")) @pytest.fixture(params=files) @@ -635,3 +635,13 @@ def test_pickle_big_dataframe_compression(protocol, compression): partial(pd.read_pickle, compression=compression), ) tm.assert_frame_equal(df, result) + + +def test_pickle_frame_v124_unpickle_130(): + # GH#42345 DataFrame created in 1.2.x, unpickle in 1.3.x + path = os.path.join(legacy_dirname, "1.2.4", "empty_frame_v1_2_4-GH#42345.pkl") + with open(path, "rb") as fd: + df = pickle.load(fd) + + expected = pd.DataFrame() + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/io/xml/test_to_xml.py b/pandas/tests/io/xml/test_to_xml.py index 1e2973075f98e..bdee797f3340c 100644 --- a/pandas/tests/io/xml/test_to_xml.py +++ b/pandas/tests/io/xml/test_to_xml.py @@ -12,7 +12,10 @@ from pandas.compat import PY38 import pandas.util._test_decorators as td -from pandas import DataFrame +from pandas import ( + DataFrame, + Index, +) import pandas._testing as tm from pandas.io.common import get_handle @@ -291,6 +294,45 @@ def test_index_false_rename_row_root(datapath, parser): assert output == expected + +@pytest.mark.parametrize( + "offset_index", [list(range(10, 13)), [str(i) for i in range(10, 13)]] +) +def test_index_false_with_offset_input_index(parser, offset_index): + """ + Tests that the output does not contain the `<index>` field when the index of the + input DataFrame has an offset. + + This is a regression test for issue #42458.
+ """ + + expected = """\ + + + + square + 360 + 4.0 + + + circle + 360 + + + + triangle + 180 + 3.0 + +""" + + offset_geom_df = geom_df.copy() + offset_geom_df.index = Index(offset_index) + output = offset_geom_df.to_xml(index=False, parser=parser) + output = equalize_decl(output) + + assert output == expected + + # NA_REP na_expected = """\ diff --git a/pandas/tests/libs/test_hashtable.py b/pandas/tests/libs/test_hashtable.py index aeff591e3f0dc..08bfc74e0ef8d 100644 --- a/pandas/tests/libs/test_hashtable.py +++ b/pandas/tests/libs/test_hashtable.py @@ -8,6 +8,7 @@ import pandas as pd import pandas._testing as tm +from pandas.core.algorithms import isin @contextmanager @@ -178,6 +179,74 @@ def test_no_reallocation(self, table_type, dtype): assert n_buckets_start == clean_table.get_state()["n_buckets"] +class TestPyObjectHashTableWithNans: + def test_nan_float(self): + nan1 = float("nan") + nan2 = float("nan") + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_complex_both(self): + nan1 = complex(float("nan"), float("nan")) + nan2 = complex(float("nan"), float("nan")) + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_complex_real(self): + nan1 = complex(float("nan"), 1) + nan2 = complex(float("nan"), 1) + other = complex(float("nan"), 2) + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + + def test_nan_complex_imag(self): + nan1 = complex(1, float("nan")) + nan2 = complex(1, float("nan")) + other = complex(2, float("nan")) + assert nan1 is not nan2 + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + + def test_nan_in_tuple(self): + nan1 = (float("nan"),) + nan2 = (float("nan"),) + assert nan1[0] is not nan2[0] + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + + def test_nan_in_nested_tuple(self): + nan1 = (1, (2, (float("nan"),))) + nan2 = (1, (2, (float("nan"),))) + other = (1, 2) + table = ht.PyObjectHashTable() + table.set_item(nan1, 42) + assert table.get_item(nan2) == 42 + with pytest.raises(KeyError, match=None) as error: + table.get_item(other) + assert str(error.value) == str(other) + + +def test_hash_equal_tuple_with_nans(): + a = (float("nan"), (float("nan"), float("nan"))) + b = (float("nan"), (float("nan"), float("nan"))) + assert ht.object_hash(a) == ht.object_hash(b) + assert ht.objects_are_equal(a, b) + + def test_get_labels_groupby_for_Int64(writable): table = ht.Int64HashTable() vals = np.array([1, 2, -1, 2, 1, -1], dtype=np.int64) @@ -277,6 +346,29 @@ def test_unique(self, table_type, dtype): assert np.all(np.isnan(unique)) and len(unique) == 1 +def test_unique_for_nan_objects_floats(): + table = ht.PyObjectHashTable() + keys = np.array([float("nan") for i in range(50)], dtype=np.object_) + unique = table.unique(keys) + assert len(unique) == 1 + + +def test_unique_for_nan_objects_complex(): + table = ht.PyObjectHashTable() + keys = np.array([complex(float("nan"), 1.0) for i in range(50)], dtype=np.object_) + unique = table.unique(keys) + assert len(unique) == 1 + + +def 
test_unique_for_nan_objects_tuple(): + table = ht.PyObjectHashTable() + keys = np.array( + [1] + [(1.0, (float("nan"), 1.0)) for i in range(50)], dtype=np.object_ + ) + unique = table.unique(keys) + assert len(unique) == 2 + + def get_ht_function(fun_name, type_suffix): return getattr(ht, fun_name) @@ -426,3 +518,20 @@ def test_mode(self, dtype, type_suffix): values = np.array([42, np.nan, np.nan, np.nan], dtype=dtype) assert mode(values, True) == 42 assert np.isnan(mode(values, False)) + + +def test_ismember_tuple_with_nans(): + # GH-41836 + values = [("a", float("nan")), ("b", 1)] + comps = [("a", float("nan"))] + result = isin(values, comps) + expected = np.array([True, False], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) + + +def test_float_complex_int_are_equal_as_objects(): + values = ["a", 5, 5.0, 5.0 + 0j] + comps = list(range(129)) + result = isin(values, comps) + expected = np.array([False, True, True, True], dtype=np.bool_) + tm.assert_numpy_array_equal(result, expected) diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py index 2f698a82bac49..c0c1c2f057c96 100644 --- a/pandas/tests/reductions/test_reductions.py +++ b/pandas/tests/reductions/test_reductions.py @@ -1480,3 +1480,10 @@ def test_mode_sortwarning(self): result = result.sort_values().reset_index(drop=True) tm.assert_series_equal(result, expected) + + def test_mode_boolean_with_na(self): + # GH#42107 + ser = Series([True, False, True, pd.NA], dtype="boolean") + result = ser.mode() + expected = Series({0: True}, dtype="boolean") + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py index 3e78d6ebf4c0c..204efa0e5670f 100644 --- a/pandas/tests/resample/test_resampler_grouper.py +++ b/pandas/tests/resample/test_resampler_grouper.py @@ -9,6 +9,7 @@ import pandas as pd from pandas import ( DataFrame, + Index, Series, TimedeltaIndex, Timestamp, @@ -405,6 +406,20 @@ def test_resample_groupby_agg(): tm.assert_frame_equal(result, expected) +def test_resample_groupby_agg_listlike(): + # GH 42905 + ts = Timestamp("2021-02-28 00:00:00") + df = DataFrame({"class": ["beta"], "value": [69]}, index=Index([ts], name="date")) + resampled = df.groupby("class").resample("M")["value"] + result = resampled.agg(["sum", "size"]) + expected = DataFrame( + [[69, 1]], + index=pd.MultiIndex.from_tuples([("beta", ts)], names=["class", "date"]), + columns=["sum", "size"], + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("keys", [["a"], ["a", "b"]]) def test_empty(keys): # GH 26411 diff --git a/pandas/tests/reshape/concat/test_dataframe.py b/pandas/tests/reshape/concat/test_dataframe.py index 3636139c19eef..91c246fc9ee2d 100644 --- a/pandas/tests/reshape/concat/test_dataframe.py +++ b/pandas/tests/reshape/concat/test_dataframe.py @@ -170,3 +170,13 @@ def test_concat_dataframe_keys_bug(self, sort): # it works result = concat([t1, t2], axis=1, keys=["t1", "t2"], sort=sort) assert list(result.columns) == [("t1", "value"), ("t2", "value")] + + def test_concat_bool_with_int(self): + # GH#42092 we may want to change this to return object, but that + # would need a deprecation + df1 = DataFrame(Series([True, False, True, True], dtype="bool")) + df2 = DataFrame(Series([1, 0, 1], dtype="int64")) + + result = concat([df1, df2]) + expected = concat([df1.astype("int64"), df2]) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/reshape/merge/test_merge.py 
b/pandas/tests/reshape/merge/test_merge.py index cd07b3814d023..52a0fd9ed81ca 100644 --- a/pandas/tests/reshape/merge/test_merge.py +++ b/pandas/tests/reshape/merge/test_merge.py @@ -354,8 +354,8 @@ def test_merge_join_key_dtype_cast(self): df = merge(df1, df2, how="outer") # GH13169 - # this really should be bool - assert df["key"].dtype == "object" + # GH#40073 + assert df["key"].dtype == "bool" df1 = DataFrame({"val": [1]}) df2 = DataFrame({"val": [2]}) @@ -366,10 +366,12 @@ def test_merge_join_key_dtype_cast(self): def test_handle_join_key_pass_array(self): left = DataFrame( - {"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, columns=["value", "key"] + {"key": [1, 1, 2, 2, 3], "value": np.arange(5)}, + columns=["value", "key"], + dtype="int64", ) - right = DataFrame({"rvalue": np.arange(6)}) - key = np.array([1, 1, 2, 3, 4, 5]) + right = DataFrame({"rvalue": np.arange(6)}, dtype="int64") + key = np.array([1, 1, 2, 3, 4, 5], dtype="int64") merged = merge(left, right, left_on="key", right_on=key, how="outer") merged2 = merge(right, left, left_on=key, right_on="key", how="outer") @@ -1642,6 +1644,59 @@ def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals): with pytest.raises(ValueError, match=msg): merge(df2, df1, on=["A"]) + @pytest.mark.parametrize( + "expected_data, how", + [ + ([1, 2], "outer"), + ([], "inner"), + ([2], "right"), + ([1], "left"), + ], + ) + def test_merge_EA_dtype(self, any_nullable_numeric_dtype, how, expected_data): + # GH#40073 + d1 = DataFrame([(1,)], columns=["id"], dtype=any_nullable_numeric_dtype) + d2 = DataFrame([(2,)], columns=["id"], dtype=any_nullable_numeric_dtype) + result = merge(d1, d2, how=how) + expected = DataFrame( + expected_data, columns=["id"], dtype=any_nullable_numeric_dtype + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "expected_data, how", + [ + (["a", "b"], "outer"), + ([], "inner"), + (["b"], "right"), + (["a"], "left"), + ], + ) + def test_merge_string_dtype(self, how, expected_data, any_string_dtype): + # GH#40073 + d1 = DataFrame([("a",)], columns=["id"], dtype=any_string_dtype) + d2 = DataFrame([("b",)], columns=["id"], dtype=any_string_dtype) + result = merge(d1, d2, how=how) + expected = DataFrame(expected_data, columns=["id"], dtype=any_string_dtype) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "how, expected_data", + [ + ("inner", [[True, 1, 4], [False, 5, 3]]), + ("outer", [[True, 1, 4], [False, 5, 3]]), + ("left", [[True, 1, 4], [False, 5, 3]]), + ("right", [[False, 5, 3], [True, 1, 4]]), + ], + ) + def test_merge_bool_dtype(self, how, expected_data): + # GH#40073 + df1 = DataFrame({"A": [True, False], "B": [1, 5]}) + df2 = DataFrame({"A": [False, True], "C": [3, 4]}) + result = merge(df1, df2, how=how) + expected = DataFrame(expected_data, columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + @pytest.fixture def left(): diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py index 6746158179964..310cf2debadc6 100644 --- a/pandas/tests/reshape/merge/test_merge_asof.py +++ b/pandas/tests/reshape/merge/test_merge_asof.py @@ -1437,3 +1437,50 @@ def test_merge_asof_index_behavior(kwargs): index=index, ) tm.assert_frame_equal(result, expected) + + +def test_merge_asof_numeri_column_in_index(): + # GH#34488 + left = pd.DataFrame({"b": [10, 11, 12]}, index=Index([1, 2, 3], name="a")) + right = pd.DataFrame({"c": [20, 21, 22]}, index=Index([0, 2, 3], name="a")) + + result = merge_asof(left, right, 
left_on="a", right_on="a") + expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12], "c": [20, 21, 22]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_asof_numeri_column_in_multiindex(): + # GH#34488 + left = pd.DataFrame( + {"b": [10, 11, 12]}, + index=pd.MultiIndex.from_arrays([[1, 2, 3], ["a", "b", "c"]], names=["a", "z"]), + ) + right = pd.DataFrame( + {"c": [20, 21, 22]}, + index=pd.MultiIndex.from_arrays([[1, 2, 3], ["x", "y", "z"]], names=["a", "y"]), + ) + + result = merge_asof(left, right, left_on="a", right_on="a") + expected = pd.DataFrame({"a": [1, 2, 3], "b": [10, 11, 12], "c": [20, 21, 22]}) + tm.assert_frame_equal(result, expected) + + +def test_merge_asof_numeri_column_in_index_object_dtype(): + # GH#34488 + left = pd.DataFrame({"b": [10, 11, 12]}, index=Index(["1", "2", "3"], name="a")) + right = pd.DataFrame({"c": [20, 21, 22]}, index=Index(["m", "n", "o"], name="a")) + + with pytest.raises( + MergeError, + match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", + ): + merge_asof(left, right, left_on="a", right_on="a") + + left = left.reset_index().set_index(["a", "b"]) + right = right.reset_index().set_index(["a", "c"]) + + with pytest.raises( + MergeError, + match=r"Incompatible merge dtype, .*, both sides must have numeric dtype", + ): + merge_asof(left, right, left_on="a", right_on="a") diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py index a950c648838ff..4972cb34aac69 100644 --- a/pandas/tests/reshape/test_melt.py +++ b/pandas/tests/reshape/test_melt.py @@ -403,6 +403,15 @@ def test_ignore_index_name_and_type(self): tm.assert_frame_equal(result, expected) + def test_melt_with_duplicate_columns(self): + # GH#41951 + df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"]) + result = df.melt(id_vars=["a"], value_vars=["b"]) + expected = DataFrame( + [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"] + ) + tm.assert_frame_equal(result, expected) + class TestLreshape: def test_pairs(self): diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py index 9f6cdbb81bd89..7dfda0463ecaf 100644 --- a/pandas/tests/scalar/timedelta/test_arithmetic.py +++ b/pandas/tests/scalar/timedelta/test_arithmetic.py @@ -10,7 +10,6 @@ import numpy as np import pytest -from pandas.compat import is_numpy_dev from pandas.errors import OutOfBoundsTimedelta import pandas as pd @@ -18,7 +17,6 @@ NaT, Timedelta, Timestamp, - compat, offsets, ) import pandas._testing as tm @@ -434,15 +432,7 @@ def test_td_div_numeric_scalar(self): "nan", [ np.nan, - pytest.param( - np.float64("NaN"), - marks=pytest.mark.xfail( - # Works on numpy dev only in python 3.9 - is_numpy_dev and not compat.PY39, - raises=RuntimeWarning, - reason="https://github.com/pandas-dev/pandas/issues/31992", - ), - ), + np.float64("NaN"), float("nan"), ], ) diff --git a/pandas/tests/scalar/timedelta/test_timedelta.py b/pandas/tests/scalar/timedelta/test_timedelta.py index 8b42bca8b8a0c..4aa2f62fe85a0 100644 --- a/pandas/tests/scalar/timedelta/test_timedelta.py +++ b/pandas/tests/scalar/timedelta/test_timedelta.py @@ -4,6 +4,7 @@ import numpy as np import pytest +from pandas._libs import lib from pandas._libs.tslibs import ( NaT, iNaT, @@ -391,8 +392,7 @@ def test_round_implementation_bounds(self): "method", [Timedelta.round, Timedelta.floor, Timedelta.ceil] ) def test_round_sanity(self, method, n, request): - iinfo = np.iinfo(np.int64) - val = np.random.randint(iinfo.min + 1, 
iinfo.max, dtype=np.int64) + val = np.random.randint(iNaT + 1, lib.i8max, dtype=np.int64) td = Timedelta(val) assert method(td, "ns") == td @@ -552,8 +552,8 @@ def test_implementation_limits(self): # GH 12727 # timedelta limits correspond to int64 boundaries - assert min_td.value == np.iinfo(np.int64).min + 1 - assert max_td.value == np.iinfo(np.int64).max + assert min_td.value == iNaT + 1 + assert max_td.value == lib.i8max # Beyond lower limit, a NAT before the Overflow assert (min_td - Timedelta(1, "ns")) is NaT diff --git a/pandas/tests/scalar/timestamp/test_comparisons.py b/pandas/tests/scalar/timestamp/test_comparisons.py index 555067f2aba1a..ee36223eb2496 100644 --- a/pandas/tests/scalar/timestamp/test_comparisons.py +++ b/pandas/tests/scalar/timestamp/test_comparisons.py @@ -266,6 +266,19 @@ def test_timestamp_compare_oob_dt64(self): assert Timestamp.max < other + us # Note: numpy gets the reversed comparison wrong + # GH-42794 + other = datetime(9999, 9, 9) + assert Timestamp.min < other + assert other > Timestamp.min + assert Timestamp.max < other + assert other > Timestamp.max + + other = datetime(1, 1, 1) + assert Timestamp.max > other + assert other < Timestamp.max + assert Timestamp.min > other + assert other < Timestamp.min + def test_compare_zerodim_array(self): # GH#26916 ts = Timestamp.now() diff --git a/pandas/tests/scalar/timestamp/test_timestamp.py b/pandas/tests/scalar/timestamp/test_timestamp.py index e13242e60e3a3..f2010b33538fb 100644 --- a/pandas/tests/scalar/timestamp/test_timestamp.py +++ b/pandas/tests/scalar/timestamp/test_timestamp.py @@ -6,6 +6,7 @@ timedelta, ) import locale +import pickle import unicodedata from dateutil.tz import tzutc @@ -440,6 +441,17 @@ def test_tz_conversion_freq(self, tz_naive_fixture): t2 = Timestamp("2019-01-02 12:00", tz="UTC", freq="T") assert t2.tz_convert(tz="UTC").freq == t2.freq + def test_pickle_freq_no_warning(self): + # GH#41949 we don't want a warning on unpickling + with tm.assert_produces_warning(FutureWarning, match="freq"): + ts = Timestamp("2019-01-01 10:00", freq="H") + + out = pickle.dumps(ts) + with tm.assert_produces_warning(None): + res = pickle.loads(out) + + assert res._freq == ts._freq + class TestTimestampNsOperations: def test_nanosecond_string_parsing(self): diff --git a/pandas/tests/scalar/timestamp/test_unary_ops.py b/pandas/tests/scalar/timestamp/test_unary_ops.py index aab0b2e6d31ef..366c0f7cf2f74 100644 --- a/pandas/tests/scalar/timestamp/test_unary_ops.py +++ b/pandas/tests/scalar/timestamp/test_unary_ops.py @@ -6,11 +6,13 @@ import pytz from pytz import utc +from pandas._libs import lib from pandas._libs.tslibs import ( NaT, Timedelta, Timestamp, conversion, + iNaT, to_offset, ) from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG @@ -279,8 +281,7 @@ def test_round_implementation_bounds(self): "method", [Timestamp.round, Timestamp.floor, Timestamp.ceil] ) def test_round_sanity(self, method, n): - iinfo = np.iinfo(np.int64) - val = np.random.randint(iinfo.min + 1, iinfo.max, dtype=np.int64) + val = np.random.randint(iNaT + 1, lib.i8max, dtype=np.int64) ts = Timestamp(val) def checker(res, ts, nanos): diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index 13054062defb4..2ad3aee99d147 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -8,6 +8,7 @@ from pandas import ( Categorical, + DataFrame, DatetimeIndex, Index, MultiIndex, @@ -906,3 +907,17 @@ def val(self): def 
is_inplace(self, obj): # This is specific to the 4 cases currently implemented for this class. return obj.dtype.kind != "i" + + +def test_setitem_with_bool_indexer(): + # GH#42530 + + df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + result = df.pop("b") + result[[True, False, False]] = 9 + expected = Series(data=[9, 5, 6], name="b") + tm.assert_series_equal(result, expected) + + df.loc[[True, False, False], "a"] = 10 + expected = DataFrame({"a": [10, 2, 3]}) + tm.assert_frame_equal(df, expected) diff --git a/pandas/tests/series/methods/test_between.py b/pandas/tests/series/methods/test_between.py index 381c733619c6b..9c11b71e4bee6 100644 --- a/pandas/tests/series/methods/test_between.py +++ b/pandas/tests/series/methods/test_between.py @@ -1,4 +1,5 @@ import numpy as np +import pytest from pandas import ( Series, @@ -28,7 +29,7 @@ def test_between_datetime_values(self): expected = ser[3:18].dropna() tm.assert_series_equal(result, expected) - result = ser[ser.between(ser[3], ser[17], inclusive=False)] + result = ser[ser.between(ser[3], ser[17], inclusive="neither")] expected = ser[5:16].dropna() tm.assert_series_equal(result, expected) @@ -38,3 +39,48 @@ def test_between_period_values(self): result = ser.between(left, right) expected = (ser >= left) & (ser <= right) tm.assert_series_equal(result, expected) + + def test_between_inclusive_string(self): # :issue:`40628` + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + + result = series.between(left, right, inclusive="both") + expected = (series >= left) & (series <= right) + tm.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="left") + expected = (series >= left) & (series < right) + tm.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="right") + expected = (series > left) & (series <= right) + tm.assert_series_equal(result, expected) + + result = series.between(left, right, inclusive="neither") + expected = (series > left) & (series < right) + tm.assert_series_equal(result, expected) + + def test_between_error_args(self): # :issue:`40628` + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + + value_error_msg = ( + "Inclusive has to be either string of 'both'," + "'left', 'right', or 'neither'." 
+ ) + + with pytest.raises(ValueError, match=value_error_msg): + series = Series(date_range("1/1/2000", periods=10)) + series.between(left, right, inclusive="yes") + + def test_between_inclusive_warning(self): + series = Series(date_range("1/1/2000", periods=10)) + left, right = series[[2, 7]] + with tm.assert_produces_warning(FutureWarning): + result = series.between(left, right, inclusive=False) + expected = (series > left) & (series < right) + tm.assert_series_equal(result, expected) + with tm.assert_produces_warning(FutureWarning): + result = series.between(left, right, inclusive=True) + expected = (series >= left) & (series <= right) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_clip.py b/pandas/tests/series/methods/test_clip.py index e4803a9cd3038..620f529b522ae 100644 --- a/pandas/tests/series/methods/test_clip.py +++ b/pandas/tests/series/methods/test_clip.py @@ -1,3 +1,5 @@ +from datetime import datetime + import numpy as np import pytest @@ -128,6 +130,15 @@ def test_clip_with_datetimes(self): ) tm.assert_series_equal(result, expected) + def test_clip_with_timestamps_and_oob_datetimes(self): + # GH-42794 + ser = Series([datetime(1, 1, 1), datetime(9999, 9, 9)]) + + result = ser.clip(lower=Timestamp.min, upper=Timestamp.max) + expected = Series([Timestamp.min, Timestamp.max], dtype="object") + + tm.assert_series_equal(result, expected) + def test_clip_pos_args_deprecation(self): # https://github.com/pandas-dev/pandas/issues/41485 ser = Series([1, 2, 3]) diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py index 1aec2a5e5d726..bb87861f6e629 100644 --- a/pandas/tests/series/methods/test_fillna.py +++ b/pandas/tests/series/methods/test_fillna.py @@ -702,6 +702,23 @@ def test_fillna_categorical_raises(self): with pytest.raises(TypeError, match=msg): ser.fillna(DataFrame({1: ["a"], 3: ["b"]})) + @pytest.mark.parametrize("dtype", [float, "float32", "float64"]) + @pytest.mark.parametrize("fill_type", tm.ALL_REAL_DTYPES) + def test_fillna_float_casting(self, dtype, fill_type): + # GH-43424 + ser = Series([np.nan, 1.2], dtype=dtype) + fill_values = Series([2, 2], dtype=fill_type) + result = ser.fillna(fill_values) + expected = Series([2.0, 1.2], dtype=dtype) + tm.assert_series_equal(result, expected) + + def test_fillna_f32_upcast_with_dict(self): + # GH-43424 + ser = Series([np.nan, 1.2], dtype=np.float32) + result = ser.fillna({0: 1}) + expected = Series([1.0, 1.2], dtype=np.float32) + tm.assert_series_equal(result, expected) + # --------------------------------------------------------------- # Invalid Usages diff --git a/pandas/tests/series/methods/test_isin.py b/pandas/tests/series/methods/test_isin.py index 898a769dfac48..d3a3434872826 100644 --- a/pandas/tests/series/methods/test_isin.py +++ b/pandas/tests/series/methods/test_isin.py @@ -156,6 +156,27 @@ def test_isin_float_in_int_series(self, values): expected = Series([True, False]) tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("dtype", ["boolean", "Int64", "Float64"]) + @pytest.mark.parametrize( + "data,values,expected", + [ + ([0, 1, 0], [1], [False, True, False]), + ([0, 1, 0], [1, pd.NA], [False, True, False]), + ([0, pd.NA, 0], [1, 0], [True, False, True]), + ([0, 1, pd.NA], [1, pd.NA], [False, True, True]), + ([0, 1, pd.NA], [1, np.nan], [False, True, False]), + ([0, pd.NA, pd.NA], [np.nan, pd.NaT, None], [False, False, False]), + ], + ) + def test_isin_masked_types(self, dtype, data, values, expected): + 
# GH#42405 + ser = Series(data, dtype=dtype) + + result = ser.isin(values) + expected = Series(expected, dtype="boolean") + + tm.assert_series_equal(result, expected) + @pytest.mark.slow def test_isin_large_series_mixed_dtypes_and_nan(): diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 3af06145b9fcd..0efb0663a0327 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -211,3 +211,19 @@ def test_nlargest_boolean(self, data, expected): result = ser.nlargest(1) expected = Series(expected) tm.assert_series_equal(result, expected) + + def test_nlargest_nullable(self, any_nullable_numeric_dtype): + # GH#42816 + dtype = any_nullable_numeric_dtype + arr = np.random.randn(10).astype(dtype.lower(), copy=False) + + ser = Series(arr.copy(), dtype=dtype) + ser[1] = pd.NA + result = ser.nlargest(5) + + expected = ( + Series(np.delete(arr, 1), index=ser.index.delete(1)) + .nlargest(5) + .astype(dtype) + ) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/methods/test_quantile.py b/pandas/tests/series/methods/test_quantile.py index 461c81bc3b44f..84bfe8524634b 100644 --- a/pandas/tests/series/methods/test_quantile.py +++ b/pandas/tests/series/methods/test_quantile.py @@ -217,3 +217,9 @@ def test_quantile_empty(self): res = s.quantile([0.5]) exp = Series([pd.NaT], index=[0.5]) tm.assert_series_equal(res, exp) + + @pytest.mark.parametrize("dtype", [int, float, "Int64"]) + def test_quantile_dtypes(self, dtype): + result = Series([1, 2, 3], dtype=dtype).quantile(np.arange(0, 1, 0.25)) + expected = Series(np.arange(1, 3, 0.5), index=np.arange(0, 1, 0.25)) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py index eddf57c1e88f3..b49c209a59a06 100644 --- a/pandas/tests/series/test_api.py +++ b/pandas/tests/series/test_api.py @@ -101,7 +101,7 @@ def test_index_tab_completion(self, index): def test_not_hashable(self): s_empty = Series(dtype=object) s = Series([1]) - msg = "'Series' objects are mutable, thus they cannot be hashed" + msg = "unhashable type: 'Series'" with pytest.raises(TypeError, match=msg): hash(s_empty) with pytest.raises(TypeError, match=msg): diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py index 93c95b3004876..12664e4463343 100644 --- a/pandas/tests/test_common.py +++ b/pandas/tests/test_common.py @@ -165,3 +165,28 @@ def test_non_bool_array_with_na(self): # in particular, this should not raise arr = np.array(["A", "B", np.nan], dtype=object) assert not com.is_bool_indexer(arr) + + def test_list_subclass(self): + # GH#42433 + + class MyList(list): + pass + + val = MyList(["a"]) + + assert not com.is_bool_indexer(val) + + val = MyList([True]) + assert com.is_bool_indexer(val) + + def test_frozenlist(self): + # GH#42461 + data = {"col1": [1, 2], "col2": [3, 4]} + df = pd.DataFrame(data=data) + + frozen = df.index.names[1:] + assert not com.is_bool_indexer(frozen) + + result = df[frozen] + expected = df[[]] + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py index ea95f90d3a2cb..e34ca9e7f9e27 100644 --- a/pandas/tests/test_downstream.py +++ b/pandas/tests/test_downstream.py @@ -69,6 +69,21 @@ def test_oo_optimizable(): subprocess.check_call([sys.executable, "-OO", "-c", "import pandas"]) +def test_oo_optimized_datetime_index_unpickle(): + # GH 42866 + subprocess.check_call( + [ + sys.executable, + 
"-OO", + "-c", + ( + "import pandas as pd, pickle; " + "pickle.loads(pickle.dumps(pd.date_range('2021-01-01', periods=1)))" + ), + ] + ) + + @tm.network # Cython import warning @pytest.mark.filterwarnings("ignore:pandas.util.testing is deprecated") diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py index 121ca99785831..9da7951c199ca 100644 --- a/pandas/tests/tools/test_to_datetime.py +++ b/pandas/tests/tools/test_to_datetime.py @@ -957,18 +957,40 @@ def test_to_datetime_cache_scalar(self): expected = Timestamp("20130101 00:00:00") assert result == expected - def test_convert_object_to_datetime_with_cache(self): + @pytest.mark.parametrize( + "datetimelikes,expected_values", + ( + ( + (None, np.nan) + (NaT,) * start_caching_at, + (NaT,) * (start_caching_at + 2), + ), + ( + (None, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + (NaT, Timestamp("2012-07-26")) + (NaT,) * start_caching_at, + ), + ( + (None,) + + (NaT,) * start_caching_at + + ("2012 July 26", Timestamp("2012-07-26")), + (NaT,) * (start_caching_at + 1) + + (Timestamp("2012-07-26"), Timestamp("2012-07-26")), + ), + ), + ) + def test_convert_object_to_datetime_with_cache( + self, datetimelikes, expected_values + ): # GH#39882 ser = Series( - [None] + [NaT] * start_caching_at + [Timestamp("2012-07-26")], + datetimelikes, dtype="object", ) - result = to_datetime(ser, errors="coerce") - expected = Series( - [NaT] * (start_caching_at + 1) + [Timestamp("2012-07-26")], + result_series = to_datetime(ser, errors="coerce") + expected_series = Series( + expected_values, dtype="datetime64[ns]", ) - tm.assert_series_equal(result, expected) + tm.assert_series_equal(result_series, expected_series) @pytest.mark.parametrize( "date, format", diff --git a/pandas/tests/util/test_hashing.py b/pandas/tests/util/test_hashing.py index 8ce24dc963dc5..e4a46de11ceb7 100644 --- a/pandas/tests/util/test_hashing.py +++ b/pandas/tests/util/test_hashing.py @@ -255,6 +255,32 @@ def test_hash_keys(): assert (a != b).all() +def test_df_hash_keys(): + # DataFrame version of the test_hash_keys. + # https://github.com/pandas-dev/pandas/issues/41404 + obj = DataFrame({"x": np.arange(3), "y": list("abc")}) + + a = hash_pandas_object(obj, hash_key="9876543210123456") + b = hash_pandas_object(obj, hash_key="9876543210123465") + + assert (a != b).all() + + +def test_df_encoding(): + # Check that DataFrame recognizes optional encoding. + # https://github.com/pandas-dev/pandas/issues/41404 + # https://github.com/pandas-dev/pandas/pull/42049 + obj = DataFrame({"x": np.arange(3), "y": list("a+c")}) + + a = hash_pandas_object(obj, encoding="utf8") + b = hash_pandas_object(obj, encoding="utf7") + + # Note that the "+" is encoded as "+-" in utf-7. + assert a[0] == b[0] + assert a[1] != b[1] + assert a[2] == b[2] + + def test_invalid_key(): # This only matters for object dtypes. 
msg = "key should be a 16-byte string encoded" diff --git a/pandas/tests/window/conftest.py b/pandas/tests/window/conftest.py index 24b28356a3099..5382f5f9202c0 100644 --- a/pandas/tests/window/conftest.py +++ b/pandas/tests/window/conftest.py @@ -90,13 +90,17 @@ def parallel(request): return request.param -@pytest.fixture(params=[True, False]) +# Can parameterize nogil & nopython over True | False, but limiting per +# https://github.com/pandas-dev/pandas/pull/41971#issuecomment-860607472 + + +@pytest.fixture(params=[False]) def nogil(request): """nogil keyword argument for numba.jit""" return request.param -@pytest.fixture(params=[True, False]) +@pytest.fixture(params=[True]) def nopython(request): """nopython keyword argument for numba.jit""" return request.param diff --git a/pandas/tests/window/test_online.py b/pandas/tests/window/test_online.py index c7580650926da..461c62c07326d 100644 --- a/pandas/tests/window/test_online.py +++ b/pandas/tests/window/test_online.py @@ -22,6 +22,7 @@ def test_invalid_update(self): ): online_ewm.mean(update=df.head(1)) + @pytest.mark.slow @pytest.mark.parametrize( "obj", [DataFrame({"a": range(5), "b": range(5)}), Series(range(5), name="foo")] ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 17a6d9216ca92..5bf2df0208ddc 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -6,7 +6,10 @@ import numpy as np import pytest -from pandas.compat import is_platform_arm +from pandas.compat import ( + is_platform_arm, + is_platform_mac, +) from pandas.errors import UnsupportedFunctionCall from pandas import ( @@ -217,6 +220,36 @@ def test_datetimelike_centered_selections( tm.assert_frame_equal(result, expected, check_dtype=False) +@pytest.mark.parametrize( + "window,closed,expected", + [ + ("3s", "right", [3.0, 3.0, 3.0]), + ("3s", "both", [3.0, 3.0, 3.0]), + ("3s", "left", [3.0, 3.0, 3.0]), + ("3s", "neither", [3.0, 3.0, 3.0]), + ("2s", "right", [3.0, 2.0, 2.0]), + ("2s", "both", [3.0, 3.0, 3.0]), + ("2s", "left", [1.0, 3.0, 3.0]), + ("2s", "neither", [1.0, 2.0, 2.0]), + ], +) +def test_datetimelike_centered_offset_covers_all( + window, closed, expected, frame_or_series +): + # GH 42753 + + index = [ + Timestamp("20130101 09:00:01"), + Timestamp("20130101 09:00:02"), + Timestamp("20130101 09:00:02"), + ] + df = frame_or_series([1, 1, 1], index=index) + + result = df.rolling(window, closed=closed, center=True).sum() + expected = frame_or_series(expected, index=index) + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) @@ -1073,7 +1106,7 @@ def test_rolling_sem(frame_or_series): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(is_platform_arm(), reason="GH 41740") +@pytest.mark.xfail(is_platform_arm() and not is_platform_mac(), reason="GH 38921") @pytest.mark.parametrize( ("func", "third_value", "values"), [ diff --git a/pyproject.toml b/pyproject.toml index 3947856d94d01..0d1aefb9d4b55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,12 +2,20 @@ # Minimum requirements for the build system to execute. # See https://github.com/scipy/scipy/pull/12940 for the AIX issue. 
requires = [ - "setuptools>=38.6.0", + "setuptools>=51.0.0", "wheel", "Cython>=0.29.21,<3", # Note: sync with setup.py - "numpy==1.17.3; python_version=='3.7'", - "numpy==1.18.3; python_version=='3.8'", - "numpy; python_version>='3.9'", + # Numpy requirements for different OS/architectures + # Copied from https://github.com/scipy/scipy/blob/master/pyproject.toml (which is also licensed under BSD) + "numpy==1.17.3; python_version=='3.7' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", + "numpy==1.18.3; python_version=='3.8' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", + "numpy==1.19.3; python_version>='3.9' and (platform_machine!='arm64' or platform_system!='Darwin') and platform_machine!='aarch64'", + # Aarch64(Python 3.9 requirements are the same as AMD64) + "numpy==1.19.2; python_version=='3.7' and platform_machine=='aarch64'", + "numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'", + # Darwin Arm64 + "numpy>=1.20.0; python_version=='3.8' and platform_machine=='arm64' and platform_system=='Darwin'", + "numpy>=1.20.0; python_version=='3.9' and platform_machine=='arm64' and platform_system=='Darwin'" ] # uncomment to enable pep517 after versioneer problem is fixed. # https://github.com/python-versioneer/python-versioneer/issues/193 diff --git a/requirements-dev.txt b/requirements-dev.txt index 332059341df48..0341f645e595b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -70,10 +70,11 @@ python-snappy pyqt5>=5.9.2 tables>=3.5.1 s3fs>=0.4.0 -fsspec>=0.7.4 +aiobotocore +fsspec>=0.7.4, <2021.6.0 gcsfs>=0.6.0 sqlalchemy -xarray +xarray<0.19 cftime pyreadstat tabulate>=0.8.3 @@ -81,3 +82,4 @@ natsort git+https://github.com/pydata/pydata-sphinx-theme.git@master numpydoc < 1.2 pandas-dev-flaker==0.2.0 +setuptools>=51.0.0 diff --git a/scripts/generate_pip_deps_from_conda.py b/scripts/generate_pip_deps_from_conda.py index 1ad9ec03925a0..ec4b6c81f764c 100755 --- a/scripts/generate_pip_deps_from_conda.py +++ b/scripts/generate_pip_deps_from_conda.py @@ -13,17 +13,18 @@ $ python scripts/generate_pip_deps_from_conda.py --compare """ import argparse -import os +import pathlib import re import sys +import toml import yaml EXCLUDE = {"python", "c-compiler", "cxx-compiler"} RENAME = {"pytables": "tables", "pyqt": "pyqt5", "dask-core": "dask"} -def conda_package_to_pip(package): +def conda_package_to_pip(package: str): """ Convert a conda package to its pip equivalent. @@ -36,17 +37,13 @@ def conda_package_to_pip(package): package = re.sub("(?<=[^<>])=", "==", package).strip() for compare in ("<=", ">=", "=="): - if compare not in package: - continue + if compare in package: + pkg, version = package.split(compare) + if pkg in EXCLUDE: + return - pkg, version = package.split(compare) - if pkg in EXCLUDE: - return - - if pkg in RENAME: - return "".join((RENAME[pkg], compare, version)) - - break + if pkg in RENAME: + return "".join((RENAME[pkg], compare, version)) if package in EXCLUDE: return @@ -57,16 +54,18 @@ def conda_package_to_pip(package): return package -def main(conda_fname, pip_fname, compare=False): +def generate_pip_from_conda( + conda_path: pathlib.Path, pip_path: pathlib.Path, compare: bool = False +) -> bool: """ Generate the pip dependencies file from the conda file, or compare that they are synchronized (``compare=True``). Parameters ---------- - conda_fname : str + conda_path : pathlib.Path Path to the conda file with dependencies (e.g. 
`environment.yml`). - pip_fname : str + pip_path : pathlib.Path Path to the pip file with dependencies (e.g. `requirements-dev.txt`). compare : bool, default False Whether to generate the pip file (``False``) or to compare if the @@ -78,8 +77,8 @@ def main(conda_fname, pip_fname, compare=False): bool True if the comparison fails, False otherwise """ - with open(conda_fname) as conda_fd: - deps = yaml.safe_load(conda_fd)["dependencies"] + with conda_path.open() as file: + deps = yaml.safe_load(file)["dependencies"] pip_deps = [] for dep in deps: @@ -88,24 +87,30 @@ def main(conda_fname, pip_fname, compare=False): if conda_dep: pip_deps.append(conda_dep) elif isinstance(dep, dict) and len(dep) == 1 and "pip" in dep: - pip_deps += dep["pip"] + pip_deps.extend(dep["pip"]) else: raise ValueError(f"Unexpected dependency {dep}") - fname = os.path.split(conda_fname)[1] header = ( - f"# This file is auto-generated from {fname}, do not modify.\n" + f"# This file is auto-generated from {conda_path.name}, do not modify.\n" "# See that file for comments about the need/usage of each dependency.\n\n" ) pip_content = header + "\n".join(pip_deps) + "\n" + # add setuptools to requirements-dev.txt + meta = toml.load(pathlib.Path(conda_path.parent, "pyproject.toml")) + for requirement in meta["build-system"]["requires"]: + if "setuptools" in requirement: + pip_content += requirement + pip_content += "\n" + if compare: - with open(pip_fname) as pip_fd: - return pip_content != pip_fd.read() - else: - with open(pip_fname, "w") as pip_fd: - pip_fd.write(pip_content) - return False + with pip_path.open() as file: + return pip_content != file.read() + + with pip_path.open("w") as file: + file.write(pip_content) + return False if __name__ == "__main__": @@ -117,25 +122,20 @@ def main(conda_fname, pip_fname, compare=False): action="store_true", help="compare whether the two files are equivalent", ) - argparser.add_argument( - "--azure", action="store_true", help="show the output in azure-pipelines format" - ) args = argparser.parse_args() - repo_path = os.path.dirname(os.path.abspath(os.path.dirname(__file__))) - res = main( - os.path.join(repo_path, "environment.yml"), - os.path.join(repo_path, "requirements-dev.txt"), + conda_fname = "environment.yml" + pip_fname = "requirements-dev.txt" + repo_path = pathlib.Path(__file__).parent.parent.absolute() + res = generate_pip_from_conda( + pathlib.Path(repo_path, conda_fname), + pathlib.Path(repo_path, pip_fname), compare=args.compare, ) if res: msg = ( - f"`requirements-dev.txt` has to be generated with `{sys.argv[0]}` after " - "`environment.yml` is modified.\n" + f"`{pip_fname}` has to be generated with `{__file__}` after " + f"`{conda_fname}` is modified.\n" ) - if args.azure: - msg = ( - f"##vso[task.logissue type=error;sourcepath=requirements-dev.txt]{msg}" - ) sys.stderr.write(msg) sys.exit(res)
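
As a usage illustration for the refactored scripts/generate_pip_deps_from_conda.py above, a minimal sketch follows. It assumes the repository root as the working directory, PyYAML and toml installed, and that placing the scripts directory on sys.path for an ad-hoc import is acceptable; the package specs shown are illustrative examples, not entries copied from environment.yml.

# Minimal usage sketch for the refactored script above (assumptions: run from
# the repo root; PyYAML and toml installed; the specs below are examples only).
import pathlib
import sys

sys.path.insert(0, "scripts")  # make the standalone script importable
from generate_pip_deps_from_conda import (
    conda_package_to_pip,
    generate_pip_from_conda,
)

# conda spec -> pip spec, via the EXCLUDE/RENAME tables defined in the script
assert conda_package_to_pip("pytables>=3.5.1") == "tables>=3.5.1"  # renamed
assert conda_package_to_pip("pyqt=5.12") == "pyqt5==5.12"  # "=" rewritten to "=="
assert conda_package_to_pip("python=3.8") is None  # excluded entirely

# Check whether requirements-dev.txt is in sync with environment.yml; the
# generated file now also appends the setuptools pin read from pyproject.toml.
out_of_date = generate_pip_from_conda(
    pathlib.Path("environment.yml"),
    pathlib.Path("requirements-dev.txt"),
    compare=True,  # pass False (the default) to rewrite requirements-dev.txt
)
print("requirements-dev.txt needs regenerating:", out_of_date)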