diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ca0c75f9de94f..a5a802c678e20 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -22,7 +22,9 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Looking for unwanted patterns run: ci/code_checks.sh patterns @@ -94,7 +96,9 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up pandas uses: ./.github/actions/setup @@ -147,7 +151,9 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Set up pandas uses: ./.github/actions/setup diff --git a/.github/workflows/database.yml b/.github/workflows/database.yml index 69f2e689c0228..b15889351386a 100644 --- a/.github/workflows/database.yml +++ b/.github/workflows/database.yml @@ -56,10 +56,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Cache conda - uses: actions/cache@v1 + uses: actions/cache@v2 env: CACHE_NUMBER: 0 with: diff --git a/.github/workflows/posix.yml b/.github/workflows/posix.yml index 34e6c2c9d94ce..3a4d3c106f851 100644 --- a/.github/workflows/posix.yml +++ b/.github/workflows/posix.yml @@ -44,10 +44,12 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v1 + uses: actions/checkout@v2 + with: + fetch-depth: 0 - name: Cache conda - uses: actions/cache@v1 + uses: actions/cache@v2 env: CACHE_NUMBER: 0 with: diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1fbd3cf85383e..3078619ecac35 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,14 +19,14 @@ repos: types_or: [python, rst, markdown] files: ^(pandas|doc)/ - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.4.0 + rev: v4.0.1 hooks: - id: debug-statements - id: end-of-file-fixer exclude: \.txt$ - id: trailing-whitespace - repo: https://github.com/cpplint/cpplint - rev: f7061b1 # the latest tag does not have the hook + rev: 1.5.5 hooks: - id: cpplint # We don't lint all C files because we don't want to lint any that are built @@ -57,7 +57,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v2.12.0 + rev: v2.18.3 hooks: - id: pyupgrade args: [--py37-plus] @@ -72,7 +72,7 @@ repos: types: [text] # overwrite types: [rst] types_or: [python, rst] - repo: https://github.com/asottile/yesqa - rev: v1.2.2 + rev: v1.2.3 hooks: - id: yesqa additional_dependencies: diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 540cd026a43d5..0000000000000 --- a/.travis.yml +++ /dev/null @@ -1,73 +0,0 @@ -language: python -python: 3.7 - -addons: - apt: - update: true - packages: - - xvfb - -services: - - xvfb - -# To turn off cached cython files and compiler cache -# set NOCACHE-true -# To delete caches go to https://travis-ci.org/OWNER/REPOSITORY/caches or run -# travis cache --delete inside the project directory from the travis command line client -# The cache directories will be deleted if anything in ci/ changes in a commit -cache: - apt: true - ccache: true - directories: - - $HOME/.cache # cython cache - -env: - global: - # create a github personal access token - # cd pandas-dev/pandas - # travis encrypt 'PANDAS_GH_TOKEN=personal_access_token' -r pandas-dev/pandas - - secure: 
"EkWLZhbrp/mXJOx38CHjs7BnjXafsqHtwxPQrqWy457VDFWhIY1DMnIR/lOWG+a20Qv52sCsFtiZEmMfUjf0pLGXOqurdxbYBGJ7/ikFLk9yV2rDwiArUlVM9bWFnFxHvdz9zewBH55WurrY4ShZWyV+x2dWjjceWG5VpWeI6sA=" - -git: - depth: false - -matrix: - fast_finish: true - - include: - - arch: arm64-graviton2 - virt: lxd - group: edge - env: - - JOB="3.7, arm64" PYTEST_WORKERS="auto" ENV_FILE="ci/deps/travis-37-arm64.yaml" PATTERN="(not slow and not network and not clipboard and not arm_slow)" - -before_install: - - echo "before_install" - # Use blocking IO on travis. Ref: https://github.com/travis-ci/travis-ci/issues/8920#issuecomment-352661024 - - python -c 'import os,sys,fcntl; flags = fcntl.fcntl(sys.stdout, fcntl.F_GETFL); fcntl.fcntl(sys.stdout, fcntl.F_SETFL, flags&~os.O_NONBLOCK);' - - source ci/travis_process_gbq_encryption.sh - - export PATH="$HOME/miniconda3/bin:$PATH" - - df -h - - pwd - - uname -a - - git --version - - ./ci/check_git_tags.sh - -install: - - echo "install start" - - ci/prep_cython_cache.sh - - ci/setup_env.sh - - ci/submit_cython_cache.sh - - echo "install done" - -script: - - echo "script start" - - echo "$JOB" - - source activate pandas-dev - - ci/run_tests.sh - -after_script: - - echo "after_script start" - - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - - ci/print_skipped.py - - echo "after_script done" diff --git a/ci/check_git_tags.sh b/ci/check_git_tags.sh deleted file mode 100755 index 9dbcd4f98683e..0000000000000 --- a/ci/check_git_tags.sh +++ /dev/null @@ -1,28 +0,0 @@ -set -e - -if [[ ! $(git tag) ]]; then - echo "No git tags in clone, please sync your git tags with upstream using:" - echo " git fetch --tags upstream" - echo " git push --tags origin" - echo "" - echo "If the issue persists, the clone depth needs to be increased in .travis.yml" - exit 1 -fi - -# This will error if there are no tags and we omit --always -DESCRIPTION=$(git describe --long --tags) -echo "$DESCRIPTION" - -if [[ "$DESCRIPTION" == *"untagged"* ]]; then - echo "Unable to determine most recent tag, aborting build" - exit 1 -else - if [[ "$DESCRIPTION" != *"g"* ]]; then - # A good description will have the hash prefixed by g, a bad one will be - # just the hash - echo "Unable to determine most recent tag, aborting build" - exit 1 - else - echo "$(git tag)" - fi -fi diff --git a/ci/deps/actions-37-db-min.yaml b/ci/deps/actions-37-db-min.yaml index 65c4c5769b1a3..cae4361ca37a7 100644 --- a/ci/deps/actions-37-db-min.yaml +++ b/ci/deps/actions-37-db-min.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index fa58f412cebf4..e568f8615a8df 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-cov>=2.10.1 # this is only needed in the coverage build, ref: GH 35737 @@ -25,7 +25,7 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.16.* + - numpy=1.17.* - odfpy - openpyxl - pandas-gbq diff --git a/ci/deps/actions-37-locale_slow.yaml b/ci/deps/actions-37-locale_slow.yaml index d9ad1f538908e..c6eb3b00a63ac 100644 --- a/ci/deps/actions-37-locale_slow.yaml +++ b/ci/deps/actions-37-locale_slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 
@@ -17,13 +17,13 @@ dependencies: - bottleneck=1.2.* - lxml - matplotlib=3.0.0 - - numpy=1.16.* + - numpy=1.17.* - openpyxl=3.0.0 - python-dateutil - python-blosc - pytz=2017.3 - scipy - - sqlalchemy=1.2.8 + - sqlalchemy=1.3.0 - xlrd=1.2.0 - xlsxwriter=1.0.2 - xlwt=1.3.0 diff --git a/ci/deps/actions-37-slow.yaml b/ci/deps/actions-37-slow.yaml index 573ff7f02c162..166f2237dcad3 100644 --- a/ci/deps/actions-37-slow.yaml +++ b/ci/deps/actions-37-slow.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-37.yaml b/ci/deps/actions-37.yaml index a209a9099d2bb..0effe6f80df86 100644 --- a/ci/deps/actions-37.yaml +++ b/ci/deps/actions-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-38-locale.yaml b/ci/deps/actions-38-locale.yaml index 629804c71e726..34a6860936550 100644 --- a/ci/deps/actions-38-locale.yaml +++ b/ci/deps/actions-38-locale.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - pytest-asyncio>=0.12.0 @@ -20,7 +20,7 @@ dependencies: - jinja2 - jedi<0.18.0 - lxml - - matplotlib <3.3.0 + - matplotlib<3.3.0 - moto - nomkl - numexpr diff --git a/ci/deps/actions-38-numpydev.yaml b/ci/deps/actions-38-numpydev.yaml index e7ee6ccfd7bac..6eed2daac0c3b 100644 --- a/ci/deps/actions-38-numpydev.yaml +++ b/ci/deps/actions-38-numpydev.yaml @@ -5,14 +5,14 @@ dependencies: - python=3.8.* # tools - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 # pandas dependencies - pytz - - pip=20.2 + - pip - pip: - cython==0.29.21 # GH#34014 - "git+git://github.com/dateutil/dateutil.git" diff --git a/ci/deps/actions-38-slow.yaml b/ci/deps/actions-38-slow.yaml index 2106f48755560..afba60e451b90 100644 --- a/ci/deps/actions-38-slow.yaml +++ b/ci/deps/actions-38-slow.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index e2660d07c3558..11daa92046eb4 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 36e8bf528fc3e..b74f1af8ee0f6 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-cov - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/deps/azure-macos-37.yaml b/ci/deps/azure-macos-37.yaml index a0b1cdc684d2c..63e858eac433f 100644 --- a/ci/deps/azure-macos-37.yaml +++ b/ci/deps/azure-macos-37.yaml @@ -6,7 +6,7 @@ dependencies: - python=3.7.* # tools - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 8266e3bc4d07d..5cbc029f8c03d 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/azure-windows-38.yaml 
b/ci/deps/azure-windows-38.yaml index 200e695a69d1f..7fdecae626f9d 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -7,7 +7,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 - pytest-azurepipelines diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/circle-37-arm64.yaml similarity index 93% rename from ci/deps/travis-37-arm64.yaml rename to ci/deps/circle-37-arm64.yaml index 8df6104f43a50..995ebda1f97e7 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/circle-37-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # tools - cython>=0.29.21 - - pytest>=5.0.1 + - pytest>=6.0 - pytest-xdist>=1.21 - hypothesis>=3.58.0 diff --git a/ci/prep_cython_cache.sh b/ci/prep_cython_cache.sh deleted file mode 100755 index 18d9388327ddc..0000000000000 --- a/ci/prep_cython_cache.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - -ls "$HOME/.cache/" - -PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` -pyx_cache_file_list=`find ${PYX_CACHE_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` - -CACHE_File="$HOME/.cache/cython_files.tar" - -# Clear the cython cache 0 = NO, 1 = YES -clear_cache=0 - -pyx_files=`echo "$pyx_file_list" | wc -l` -pyx_cache_files=`echo "$pyx_cache_file_list" | wc -l` - -if [[ pyx_files -ne pyx_cache_files ]] -then - echo "Different number of pyx files" - clear_cache=1 -fi - -home_dir=$(pwd) - -if [ -f "$CACHE_File" ] && [ -z "$NOCACHE" ] && [ -d "$PYX_CACHE_DIR" ]; then - - echo "Cache available - checking pyx diff" - - for i in ${pyx_file_list} - do - diff=`diff -u $i $PYX_CACHE_DIR${i}` - if [[ $? -eq 2 ]] - then - echo "${i##*/} can't be diffed; probably not in cache" - clear_cache=1 - fi - if [[ ! 
-z $diff ]] - then - echo "${i##*/} has changed:" - echo $diff - clear_cache=1 - fi - done - - if [ "$TRAVIS_PULL_REQUEST" == "false" ] - then - echo "Not a PR" - # Uncomment next 2 lines to turn off cython caching not in a PR - # echo "Non PR cython caching is disabled" - # clear_cache=1 - else - echo "In a PR" - # Uncomment next 2 lines to turn off cython caching in a PR - # echo "PR cython caching is disabled" - # clear_cache=1 - fi - -fi - -if [ $clear_cache -eq 0 ] && [ -z "$NOCACHE" ] -then - # No and nocache is not set - echo "Will reuse cached cython file" - cd / - tar xvmf $CACHE_File - cd $home_dir -else - echo "Rebuilding cythonized files" - echo "No cache = $NOCACHE" - echo "Clear cache (1=YES) = $clear_cache" -fi - - -exit 0 diff --git a/ci/setup_env.sh b/ci/setup_env.sh index c36422884f2ec..2e16bc6545161 100755 --- a/ci/setup_env.sh +++ b/ci/setup_env.sh @@ -12,41 +12,30 @@ if [[ "$(uname)" == "Linux" && -n "$LC_ALL" ]]; then echo fi -MINICONDA_DIR="$HOME/miniconda3" - - -if [ -d "$MINICONDA_DIR" ]; then - echo - echo "rm -rf "$MINICONDA_DIR"" - rm -rf "$MINICONDA_DIR" -fi echo "Install Miniconda" -UNAME_OS=$(uname) -if [[ "$UNAME_OS" == 'Linux' ]]; then +DEFAULT_CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest" +if [[ "$(uname -m)" == 'aarch64' ]]; then + CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.10.1-4/Miniforge3-4.10.1-4-Linux-aarch64.sh" +elif [[ "$(uname)" == 'Linux' ]]; then if [[ "$BITS32" == "yes" ]]; then - CONDA_OS="Linux-x86" + CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86.sh" else - CONDA_OS="Linux-x86_64" + CONDA_URL="$DEFAULT_CONDA_URL-Linux-x86_64.sh" fi -elif [[ "$UNAME_OS" == 'Darwin' ]]; then - CONDA_OS="MacOSX-x86_64" +elif [[ "$(uname)" == 'Darwin' ]]; then + CONDA_URL="$DEFAULT_CONDA_URL-MacOSX-x86_64.sh" else - echo "OS $UNAME_OS not supported" + echo "OS $(uname) not supported" exit 1 fi - -if [ "${TRAVIS_CPU_ARCH}" == "arm64" ]; then - CONDA_URL="https://github.com/conda-forge/miniforge/releases/download/4.8.5-1/Miniforge3-4.8.5-1-Linux-aarch64.sh" -else - CONDA_URL="https://repo.continuum.io/miniconda/Miniconda3-latest-$CONDA_OS.sh" -fi +echo "Downloading $CONDA_URL" wget -q $CONDA_URL -O miniconda.sh chmod +x miniconda.sh -# Installation path is required for ARM64 platform as miniforge script installs in path $HOME/miniforge3. 
+MINICONDA_DIR="$HOME/miniconda3" +rm -rf $MINICONDA_DIR ./miniconda.sh -b -p $MINICONDA_DIR - export PATH=$MINICONDA_DIR/bin:$PATH echo @@ -63,29 +52,6 @@ conda update -n base conda echo "conda info -a" conda info -a -echo -echo "set the compiler cache to work" -if [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "linux" ]; then - echo "Using ccache" - export PATH=/usr/lib/ccache:/usr/lib64/ccache:$PATH - GCC=$(which gcc) - echo "gcc: $GCC" - CCACHE=$(which ccache) - echo "ccache: $CCACHE" - export CC='ccache gcc' -elif [ -z "$NOCACHE" ] && [ "${TRAVIS_OS_NAME}" == "osx" ]; then - echo "Install ccache" - brew install ccache > /dev/null 2>&1 - echo "Using ccache" - export PATH=/usr/local/opt/ccache/libexec:$PATH - gcc=$(which gcc) - echo "gcc: $gcc" - CCACHE=$(which ccache) - echo "ccache: $CCACHE" -else - echo "Not using ccache" -fi - echo "source deactivate" source deactivate diff --git a/ci/submit_cython_cache.sh b/ci/submit_cython_cache.sh deleted file mode 100755 index b87acef0ba11c..0000000000000 --- a/ci/submit_cython_cache.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -CACHE_File="$HOME/.cache/cython_files.tar" -PYX_CACHE_DIR="$HOME/.cache/pyxfiles" -pyx_file_list=`find ${TRAVIS_BUILD_DIR} -name "*.pyx" -o -name "*.pxd" -o -name "*.pxi.in"` - -rm -rf $CACHE_File -rm -rf $PYX_CACHE_DIR - -home_dir=$(pwd) - -mkdir -p $PYX_CACHE_DIR -rsync -Rv $pyx_file_list $PYX_CACHE_DIR - -echo "pyx files:" -echo $pyx_file_list - -tar cf ${CACHE_File} --files-from /dev/null - -for i in ${pyx_file_list} -do - f=${i%.pyx} - ls $f.{c,cpp} | tar rf ${CACHE_File} -T - -done - -echo "Cython files in cache tar:" -tar tvf ${CACHE_File} - -exit 0 diff --git a/ci/travis_encrypt_gbq.sh b/ci/travis_encrypt_gbq.sh deleted file mode 100755 index 7d5692d9520af..0000000000000 --- a/ci/travis_encrypt_gbq.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -GBQ_JSON_FILE=$1 - -if [[ $# -ne 1 ]]; then - echo -e "Too few arguments.\nUsage: ./travis_encrypt_gbq.sh "\ - "" - exit 1 -fi - -if [[ $GBQ_JSON_FILE != *.json ]]; then - echo "ERROR: Expected *.json file" - exit 1 -fi - -if [[ ! -f $GBQ_JSON_FILE ]]; then - echo "ERROR: File $GBQ_JSON_FILE does not exist" - exit 1 -fi - -echo "Encrypting $GBQ_JSON_FILE..." -read -d "\n" TRAVIS_KEY TRAVIS_IV <<<$(travis encrypt-file -r pandas-dev/pandas $GBQ_JSON_FILE \ -travis_gbq.json.enc -f | grep -o "\w*_iv\|\w*_key"); - -echo "Adding your secure key to travis_gbq_config.txt ..." -echo -e "TRAVIS_IV_ENV=$TRAVIS_IV\nTRAVIS_KEY_ENV=$TRAVIS_KEY"\ -> travis_gbq_config.txt - -echo "Done. 
Removing file $GBQ_JSON_FILE" -rm $GBQ_JSON_FILE - -echo -e "Created encrypted credentials file travis_gbq.json.enc.\n"\ - "NOTE: Do NOT commit the *.json file containing your unencrypted" \ - "private key" diff --git a/ci/travis_gbq.json.enc b/ci/travis_gbq.json.enc deleted file mode 100644 index 6e0b6cee4048c..0000000000000 Binary files a/ci/travis_gbq.json.enc and /dev/null differ diff --git a/ci/travis_gbq_config.txt b/ci/travis_gbq_config.txt deleted file mode 100644 index dc857c450331c..0000000000000 --- a/ci/travis_gbq_config.txt +++ /dev/null @@ -1,2 +0,0 @@ -TRAVIS_IV_ENV=encrypted_e05c934e101e_iv -TRAVIS_KEY_ENV=encrypted_e05c934e101e_key diff --git a/ci/travis_process_gbq_encryption.sh b/ci/travis_process_gbq_encryption.sh deleted file mode 100755 index b5118ad5defc6..0000000000000 --- a/ci/travis_process_gbq_encryption.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -source ci/travis_gbq_config.txt - -if [[ -n ${SERVICE_ACCOUNT_KEY} ]]; then - echo "${SERVICE_ACCOUNT_KEY}" > ci/travis_gbq.json; -elif [[ -n ${!TRAVIS_IV_ENV} ]]; then - openssl aes-256-cbc -K ${!TRAVIS_KEY_ENV} -iv ${!TRAVIS_IV_ENV} \ - -in ci/travis_gbq.json.enc -out ci/travis_gbq.json -d; - export GBQ_PROJECT_ID='pandas-gbq-tests'; - echo 'Successfully decrypted gbq credentials' -fi diff --git a/doc/README.rst b/doc/README.rst deleted file mode 100644 index 5423e7419d03b..0000000000000 --- a/doc/README.rst +++ /dev/null @@ -1 +0,0 @@ -See `contributing.rst `_ in this repo. diff --git a/doc/source/_static/style/latex_1.png b/doc/source/_static/style/latex_1.png new file mode 100644 index 0000000000000..8b901878a0ec9 Binary files /dev/null and b/doc/source/_static/style/latex_1.png differ diff --git a/doc/source/_static/style/latex_2.png b/doc/source/_static/style/latex_2.png new file mode 100644 index 0000000000000..7d6baa681575e Binary files /dev/null and b/doc/source/_static/style/latex_2.png differ diff --git a/doc/source/_static/style/tg_ax0.png b/doc/source/_static/style/tg_ax0.png new file mode 100644 index 0000000000000..3460329352282 Binary files /dev/null and b/doc/source/_static/style/tg_ax0.png differ diff --git a/doc/source/_static/style/tg_axNone.png b/doc/source/_static/style/tg_axNone.png new file mode 100644 index 0000000000000..00357f7eb016b Binary files /dev/null and b/doc/source/_static/style/tg_axNone.png differ diff --git a/doc/source/_static/style/tg_axNone_gmap.png b/doc/source/_static/style/tg_axNone_gmap.png new file mode 100644 index 0000000000000..d06a4b244a23d Binary files /dev/null and b/doc/source/_static/style/tg_axNone_gmap.png differ diff --git a/doc/source/_static/style/tg_axNone_lowhigh.png b/doc/source/_static/style/tg_axNone_lowhigh.png new file mode 100644 index 0000000000000..bc3fb16ee8e40 Binary files /dev/null and b/doc/source/_static/style/tg_axNone_lowhigh.png differ diff --git a/doc/source/_static/style/tg_axNone_vminvmax.png b/doc/source/_static/style/tg_axNone_vminvmax.png new file mode 100644 index 0000000000000..42579c2840fb9 Binary files /dev/null and b/doc/source/_static/style/tg_axNone_vminvmax.png differ diff --git a/doc/source/_static/style/tg_gmap.png b/doc/source/_static/style/tg_gmap.png new file mode 100644 index 0000000000000..fb73529544180 Binary files /dev/null and b/doc/source/_static/style/tg_gmap.png differ diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index bc2325f15852c..ee061e7b7d3e6 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -75,12 +75,12 @@ Statsmodels leverages pandas objects as the underlying 
data container for comput Use pandas DataFrames in your `scikit-learn `__ ML pipeline. -`Featuretools `__ +`Featuretools `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. -`Compose `__ +`Compose `__ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. @@ -551,11 +551,12 @@ Library Accessor Classes Description ================== ============ ==================================== =============================================================================== `cyberpandas`_ ``ip`` ``Series`` Provides common operations for working with IP addresses. `pdvega`_ ``vgplot`` ``Series``, ``DataFrame`` Provides plotting functions from the Altair_ library. -`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame`` Provides common operations for quality control and analysis of genomics data +`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame`` Provides common operations for quality control and analysis of genomics data. `pandas_path`_ ``path`` ``Index``, ``Series`` Provides `pathlib.Path`_ functions for Series. `pint-pandas`_ ``pint`` ``Series``, ``DataFrame`` Provides units support for numeric Series and DataFrames. `composeml`_ ``slice`` ``DataFrame`` Provides a generator for enhanced data slicing. `datatest`_ ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers. +`woodwork`_ ``ww`` ``Series``, ``DataFrame`` Provides physical, logical, and semantic data typing information for Series and DataFrames. ================== ============ ==================================== =============================================================================== .. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest @@ -565,5 +566,6 @@ Library Accessor Classes Description .. _pandas_path: https://github.com/drivendataorg/pandas-path/ .. _pathlib.Path: https://docs.python.org/3/library/pathlib.html .. _pint-pandas: https://github.com/hgrecco/pint-pandas -.. _composeml: https://github.com/FeatureLabs/compose +.. _composeml: https://github.com/alteryx/compose .. _datatest: https://datatest.readthedocs.io/ +.. 
_woodwork: https://github.com/alteryx/woodwork diff --git a/doc/source/reference/style.rst b/doc/source/reference/style.rst index 8c443f3ae9bb6..0d743b5fe8b8b 100644 --- a/doc/source/reference/style.rst +++ b/doc/source/reference/style.rst @@ -24,6 +24,7 @@ Styler properties Styler.env Styler.template_html + Styler.template_latex Styler.loader Style application @@ -55,6 +56,7 @@ Builtin styles Styler.highlight_min Styler.highlight_between Styler.background_gradient + Styler.text_gradient Styler.bar Style export and import @@ -66,3 +68,4 @@ Style export and import Styler.export Styler.use Styler.to_excel + Styler.to_latex diff --git a/doc/source/user_guide/groupby.rst b/doc/source/user_guide/groupby.rst index ef6d45fa0140b..7a55acbd3031d 100644 --- a/doc/source/user_guide/groupby.rst +++ b/doc/source/user_guide/groupby.rst @@ -1000,6 +1000,7 @@ instance method on each data group. This is pretty easy to do by passing lambda functions: .. ipython:: python + :okwarning: grouped = df.groupby("A") grouped.agg(lambda x: x.std()) @@ -1009,6 +1010,7 @@ arguments. Using a bit of metaprogramming cleverness, GroupBy now has the ability to "dispatch" method calls to the groups: .. ipython:: python + :okwarning: grouped.std() diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 7f0cd613726dc..b4e35d1f22840 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -22,6 +22,7 @@ The pandas I/O API is a set of top level ``reader`` functions accessed like text;Fixed-Width Text File;:ref:`read_fwf` text;`JSON `__;:ref:`read_json`;:ref:`to_json` text;`HTML `__;:ref:`read_html`;:ref:`to_html` + text;`LaTeX `__;;:ref:`Styler.to_latex` text;`XML `__;:ref:`read_xml`;:ref:`to_xml` text; Local clipboard;:ref:`read_clipboard`;:ref:`to_clipboard` binary;`MS Excel `__;:ref:`read_excel`;:ref:`to_excel` @@ -343,16 +344,33 @@ dialect : str or :class:`python:csv.Dialect` instance, default ``None`` Error handling ++++++++++++++ -error_bad_lines : boolean, default ``True`` +error_bad_lines : boolean, default ``None`` Lines with too many fields (e.g. a csv line with too many commas) will by default cause an exception to be raised, and no ``DataFrame`` will be returned. If ``False``, then these "bad lines" will dropped from the ``DataFrame`` that is returned. See :ref:`bad lines ` below. -warn_bad_lines : boolean, default ``True`` + + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line. +warn_bad_lines : boolean, default ``None`` If error_bad_lines is ``False``, and warn_bad_lines is ``True``, a warning for each "bad line" will be output. + .. deprecated:: 1.3 + The ``on_bad_lines`` parameter should be used instead to specify behavior upon + encountering a bad line. +on_bad_lines : {{'error', 'warn', 'skip'}}, default 'error' + Specifies what to do upon encountering a bad line (a line with too many fields). + Allowed values are: + + - 'error', raise a ParserError when a bad line is encountered. + - 'warn', print a warning when a bad line is encountered and skip that line. + - 'skip', skip bad lines without raising or warning when they are encountered. + + .. versionadded:: 1.3 + .. _io.dtypes: Specifying column data types @@ -1244,7 +1262,7 @@ You can elect to skip bad lines: ..
code-block:: ipython - In [29]: pd.read_csv(StringIO(data), error_bad_lines=False) + In [29]: pd.read_csv(StringIO(data), on_bad_lines="warn") Skipping line 3: expected 3 fields, saw 4 Out[29]: @@ -1896,7 +1914,7 @@ Writing in ISO date format: dfd = pd.DataFrame(np.random.randn(5, 2), columns=list("AB")) dfd["date"] = pd.Timestamp("20130101") - dfd = dfd.sort_index(1, ascending=False) + dfd = dfd.sort_index(axis=1, ascending=False) json = dfd.to_json(date_format="iso") json @@ -2830,7 +2848,42 @@ parse HTML tables in the top-level pandas io function ``read_html``. .. |lxml| replace:: **lxml** .. _lxml: https://lxml.de +.. _io.latex: + +LaTeX +----- + +.. versionadded:: 1.3.0 + +Currently there are no methods to read from LaTeX, only output methods. + +Writing to LaTeX files +'''''''''''''''''''''' + +.. note:: + + DataFrame *and* Styler objects currently have a ``to_latex`` method. We recommend + using the `Styler.to_latex() <../reference/api/pandas.io.formats.style.Styler.to_latex.rst>`__ method + over `DataFrame.to_latex() <../reference/api/pandas.DataFrame.to_latex.rst>`__ due to the former's greater flexibility with + conditional styling, and the latter's possible future deprecation. + +Review the documentation for `Styler.to_latex <../reference/api/pandas.io.formats.style.Styler.to_latex.rst>`__, +which gives examples of conditional styling and explains the operation of its keyword +arguments. + +For simple application the following pattern is sufficient. + +.. ipython:: python + + df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["c", "d"]) + print(df.style.to_latex()) +To format values before output, chain the `Styler.format <../reference/api/pandas.io.formats.style.Styler.format.rst>`__ +method. + +.. ipython:: python + + print(df.style.format("€ {}").to_latex()) XML --- @@ -3648,15 +3701,6 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. df1.to_excel(writer, sheet_name="Sheet1") df2.to_excel(writer, sheet_name="Sheet2") -.. note:: - - Wringing a little more performance out of ``read_excel`` - Internally, Excel stores all numeric data as floats. Because this can - produce unexpected behavior when reading in data, pandas defaults to trying - to convert integers to floats if it doesn't lose information (``1.0 --> - 1``). You can pass ``convert_float=False`` to disable this behavior, which - may give a slight performance improvement. - .. 
_io.excel_writing_buffer: Writing Excel files to memory diff --git a/doc/source/user_guide/style.ipynb b/doc/source/user_guide/style.ipynb index 86696cc909764..7d8d8e90dfbda 100644 --- a/doc/source/user_guide/style.ipynb +++ b/doc/source/user_guide/style.ipynb @@ -1012,7 +1012,8 @@ " - [.highlight_min][minfunc] and [.highlight_max][maxfunc]: for use with identifying extremeties in data.\n", " - [.highlight_between][betweenfunc] and [.highlight_quantile][quantilefunc]: for use with identifying classes within data.\n", " - [.background_gradient][bgfunc]: a flexible method for highlighting cells based or their, or other, values on a numeric scale.\n", - " - [.bar][barfunc]: to display mini-charts within cell backgrounds.\n", + " - [.text_gradient][textfunc]: similar method for highlighting text based on their, or other, values on a numeric scale.\n", + " - [.bar][barfunc]: to display mini-charts within cell backgrounds.\n", " \n", "The individual documentation on each function often gives more examples of their arguments.\n", "\n", @@ -1022,6 +1023,7 @@ "[betweenfunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_between.rst\n", "[quantilefunc]: ../reference/api/pandas.io.formats.style.Styler.highlight_quantile.rst\n", "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst\n", + "[textfunc]: ../reference/api/pandas.io.formats.style.Styler.text_gradient.rst\n", "[barfunc]: ../reference/api/pandas.io.formats.style.Styler.bar.rst" ] }, @@ -1098,14 +1100,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Background Gradient" + "### Background Gradient and Text Gradient" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "You can create \"heatmaps\" with the `background_gradient` method. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap." + "You can create \"heatmaps\" with the `background_gradient` and `text_gradient` methods. These require matplotlib, and we'll use [Seaborn](https://stanford.edu/~mwaskom/software/seaborn/) to get a nice colormap." ] }, { @@ -1120,19 +1122,31 @@ "df2.style.background_gradient(cmap=cm)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df2.style.text_gradient(cmap=cm)" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "[.background_gradient][bgfunc] has a number of keyword arguments to customise the gradients and colors. See its documentation.\n", + "[.background_gradient][bgfunc] and [.text_gradient][textfunc] have a number of keyword arguments to customise the gradients and colors. See the documentation.\n", "\n", - "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst" + "[bgfunc]: ../reference/api/pandas.io.formats.style.Styler.background_gradient.rst\n", + "[textfunc]: ../reference/api/pandas.io.formats.style.Styler.text_gradient.rst" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ + "### Set properties\n", + "\n", "Use `Styler.set_properties` when the style doesn't actually depend on the values. This is just a simple wrapper for `.applymap` where the function returns the same properties for all cells." 
] }, @@ -1448,7 +1462,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.format(escape=True)" + "df4.style.format(escape=\"html\")" ] }, { @@ -1457,7 +1471,7 @@ "metadata": {}, "outputs": [], "source": [ - "df4.style.format('{}', escape=True)" + "df4.style.format('{}', escape=\"html\")" ] }, { diff --git a/doc/source/whatsnew/v1.2.5.rst b/doc/source/whatsnew/v1.2.5.rst index 60e146b2212eb..500030e1304c6 100644 --- a/doc/source/whatsnew/v1.2.5.rst +++ b/doc/source/whatsnew/v1.2.5.rst @@ -15,8 +15,9 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ - Regression in :func:`concat` between two :class:`DataFrames` where one has an :class:`Index` that is all-None and the other is :class:`DatetimeIndex` incorrectly raising (:issue:`40841`) +- Fixed regression in :meth:`DataFrame.sum` and :meth:`DataFrame.prod` when ``min_count`` and ``numeric_only`` are both given (:issue:`41074`) - Regression in :func:`read_csv` when using ``memory_map=True`` with an non-UTF8 encoding (:issue:`40986`) -- +- Regression in :meth:`DataFrame.replace` and :meth:`Series.replace` when the values to replace is a NumPy float array (:issue:`40371`) .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index d357e4a633347..987a19cf99dd6 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -120,8 +120,8 @@ to allow custom CSS highlighting instead of default background coloring (:issue: Enhancements to other built-in methods include extending the :meth:`.Styler.background_gradient` method to shade elements based on a given gradient map and not be restricted only to values in the DataFrame (:issue:`39930` :issue:`22727` :issue:`28901`). Additional -built-in methods such as :meth:`.Styler.highlight_between` and :meth:`.Styler.highlight_quantile` -have been added (:issue:`39821` and :issue:`40926`). +built-in methods such as :meth:`.Styler.highlight_between`, :meth:`.Styler.highlight_quantile` +and :meth:`.Styler.text_gradient` have been added (:issue:`39821`, :issue:`40926`, :issue:`41098`). The :meth:`.Styler.apply` now consistently allows functions with ``ndarray`` output to allow more flexible development of UDFs when ``axis`` is ``None`` ``0`` or ``1`` (:issue:`39393`). @@ -141,6 +141,9 @@ properly format HTML and eliminate some inconsistencies (:issue:`39942` :issue:` :class:`.Styler` has also been compatible with non-unique index or columns, at least for as many features as are fully compatible, others made only partially compatible (:issue:`41269`). One also has greater control of the display through separate sparsification of the index or columns, using the new 'styler' options context (:issue:`41142`). +We have added an extension to allow LaTeX styling as an alternative to CSS styling and a method :meth:`.Styler.to_latex` +which renders the necessary LaTeX format including built-up styles. An additional file io function :meth:`Styler.to_html` has been added for convenience (:issue:`40312`). + Documentation has also seen major revisions in light of new features (:issue:`39720` :issue:`39317` :issue:`40493`) ..
_whatsnew_130.dataframe_honors_copy_with_dict: @@ -230,6 +233,7 @@ Other enhancements - Add keyword ``sort`` to :func:`pivot_table` to allow non-sorting of the result (:issue:`39143`) - Add keyword ``dropna`` to :meth:`DataFrame.value_counts` to allow counting rows that include ``NA`` values (:issue:`41325`) - :meth:`Series.replace` will now cast results to ``PeriodDtype`` where possible instead of ``object`` dtype (:issue:`41526`) +- Improved error message in ``corr`` and ``cov`` methods on :class:`.Rolling`, :class:`.Expanding`, and :class:`.ExponentialMovingWindow` when ``other`` is not a :class:`DataFrame` or :class:`Series` (:issue:`41741`) .. --------------------------------------------------------------------------- @@ -439,7 +443,7 @@ In the new behavior, we get a new array, and retain an integer-dtyped ``5``: Consistent Casting With Setting Into Boolean Series ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Setting non-boolean values into a :class:`Series with ``dtype=bool`` consistently +Setting non-boolean values into a :class:`Series` with ``dtype=bool`` consistently cast to ``dtype=object`` (:issue:`38709`) .. ipython:: python @@ -615,7 +619,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | scipy | 1.2.0 | | +-----------------+-----------------+---------+ -| sqlalchemy | 1.2.8 | | +| sqlalchemy | 1.3.0 | X | +-----------------+-----------------+---------+ | tabulate | 0.8.7 | X | +-----------------+-----------------+---------+ @@ -639,6 +643,7 @@ Other API changes - Partially initialized :class:`CategoricalDtype` (i.e. those with ``categories=None`` objects will no longer compare as equal to fully initialized dtype objects. - Accessing ``_constructor_expanddim`` on a :class:`DataFrame` and ``_constructor_sliced`` on a :class:`Series` now raise an ``AttributeError``. Previously a ``NotImplementedError`` was raised (:issue:`38782`) - Added new ``engine`` and ``**engine_kwargs`` parameters to :meth:`DataFrame.to_sql` to support other future "SQL engines". 
Currently we still only use ``SQLAlchemy`` under the hood, but more engines are planned to be supported such as ``turbodbc`` (:issue:`36893`) +- Removed redundant ``freq`` from :class:`PeriodIndex` string representation (:issue:`41653`) Build ===== @@ -652,6 +657,7 @@ Build Deprecations ~~~~~~~~~~~~ - Deprecated allowing scalars to be passed to the :class:`Categorical` constructor (:issue:`38433`) +- Deprecated constructing :class:`CategoricalIndex` without passing list-like data (:issue:`38944`) - Deprecated allowing subclass-specific keyword arguments in the :class:`Index` constructor, use the specific subclass directly instead (:issue:`14093`, :issue:`21311`, :issue:`22315`, :issue:`26974`) - Deprecated ``astype`` of datetimelike (``timedelta64[ns]``, ``datetime64[ns]``, ``Datetime64TZDtype``, ``PeriodDtype``) to integer dtypes, use ``values.view(...)`` instead (:issue:`38544`) - Deprecated :meth:`MultiIndex.is_lexsorted` and :meth:`MultiIndex.lexsort_depth`, use :meth:`MultiIndex.is_monotonic_increasing` instead (:issue:`32259`) @@ -665,6 +671,7 @@ Deprecations - Deprecated casting ``datetime.date`` objects to ``datetime64`` when used as ``fill_value`` in :meth:`DataFrame.unstack`, :meth:`DataFrame.shift`, :meth:`Series.shift`, and :meth:`DataFrame.reindex`, pass ``pd.Timestamp(dateobj)`` instead (:issue:`39767`) - Deprecated :meth:`.Styler.set_na_rep` and :meth:`.Styler.set_precision` in favour of :meth:`.Styler.format` with ``na_rep`` and ``precision`` as existing and new input arguments respectively (:issue:`40134`, :issue:`40425`) - Deprecated allowing partial failure in :meth:`Series.transform` and :meth:`DataFrame.transform` when ``func`` is list-like or dict-like and raises anything but ``TypeError``; ``func`` raising anything but a ``TypeError`` will raise in a future version (:issue:`40211`) +- Deprecated arguments ``error_bad_lines`` and ``warn_bad_lines`` in :func:`read_csv` and :func:`read_table` in favor of argument ``on_bad_lines`` (:issue:`15122`) - Deprecated support for ``np.ma.mrecords.MaskedRecords`` in the :class:`DataFrame` constructor, pass ``{name: data[name] for name in data.dtype.names}`` instead (:issue:`40363`) - Deprecated using :func:`merge` or :func:`join` on a different number of levels (:issue:`34862`) - Deprecated the use of ``**kwargs`` in :class:`.ExcelWriter`; use the keyword argument ``engine_kwargs`` instead (:issue:`40430`) @@ -672,12 +679,35 @@ Deprecations - The ``inplace`` parameter of :meth:`Categorical.remove_categories`, :meth:`Categorical.add_categories`, :meth:`Categorical.reorder_categories`, :meth:`Categorical.rename_categories`, :meth:`Categorical.set_categories` is deprecated and will be removed in a future version (:issue:`37643`) - Deprecated :func:`merge` producing duplicated columns through the ``suffixes`` keyword and already existing columns (:issue:`22818`) - Deprecated setting :attr:`Categorical._codes`, create a new :class:`Categorical` with the desired codes instead (:issue:`40606`) +- Deprecated the ``convert_float`` optional argument in :func:`read_excel` and :meth:`ExcelFile.parse` (:issue:`41127`) - Deprecated behavior of :meth:`DatetimeIndex.union` with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`) - Deprecated using ``usecols`` with out of bounds indices for ``read_csv`` with ``engine="c"`` (:issue:`25623`) +- Deprecated passing arguments as positional (except for ``"codes"``) in :meth:`MultiIndex.codes` (:issue:`41485`) +- Deprecated passing arguments as
positional in :meth:`Index.set_names` and :meth:`MultiIndex.set_names` (except for ``names``) (:issue:`41485`) +- Deprecated passing arguments (apart from ``cond`` and ``other``) as positional in :meth:`DataFrame.mask` and :meth:`Series.mask` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.clip` and :meth:`Series.clip` (other than ``"upper"`` and ``"lower"``) (:issue:`41485`) - Deprecated special treatment of lists with first element a Categorical in the :class:`DataFrame` constructor; pass as ``pd.DataFrame({col: categorical, ...})`` instead (:issue:`38845`) +- Deprecated behavior of :class:`DataFrame` constructor when a ``dtype`` is passed and the data cannot be cast to that dtype. In a future version, this will raise instead of being silently ignored (:issue:`24435`) - Deprecated passing arguments as positional (except for ``"method"``) in :meth:`DataFrame.interpolate` and :meth:`Series.interpolate` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.ffill`, :meth:`Series.ffill`, :meth:`DataFrame.bfill`, and :meth:`Series.bfill` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.sort_values` (other than ``"by"``) and :meth:`Series.sort_values` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.dropna` and :meth:`Series.dropna` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.set_index` (other than ``"keys"``) (:issue:`41485`) +- Deprecated passing arguments as positional (except for ``"levels"``) in :meth:`MultiIndex.set_levels` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.drop_duplicates` (except for ``subset``), :meth:`Series.drop_duplicates`, :meth:`Index.drop_duplicates` and :meth:`MultiIndex.drop_duplicates` (:issue:`41485`) - Deprecated passing arguments (apart from ``value``) as positional in :meth:`DataFrame.fillna` and :meth:`Series.fillna` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.reset_index` (other than ``"level"``) and :meth:`Series.reset_index` (:issue:`41485`) - Deprecated construction of :class:`Series` or :class:`DataFrame` with ``DatetimeTZDtype`` data and ``datetime64[ns]`` dtype. Use ``Series(data).dt.tz_localize(None)`` instead (:issue:`41555`,:issue:`33401`) +- Deprecated behavior of :class:`Series` construction with large-integer values and small-integer dtype silently overflowing; use ``Series(data).astype(dtype)`` instead (:issue:`41734`) +- Deprecated inference of ``timedelta64[ns]``, ``datetime64[ns]``, or ``DatetimeTZDtype`` dtypes in :class:`Series` construction when data containing strings is passed and no ``dtype`` is passed (:issue:`33558`) +- In a future version, constructing :class:`Series` or :class:`DataFrame` with ``datetime64[ns]`` data and ``DatetimeTZDtype`` will treat the data as wall-times instead of as UTC times (matching DatetimeIndex behavior). 
To treat the data as UTC times, use ``pd.Series(data).dt.tz_localize("UTC").dt.tz_convert(dtype.tz)`` or ``pd.Series(data.view("int64"), dtype=dtype)`` (:issue:`33401`) +- Deprecated passing arguments as positional in :meth:`DataFrame.set_axis` and :meth:`Series.set_axis` (other than ``"labels"``) (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.where` and :meth:`Series.where` (other than ``"cond"`` and ``"other"``) (:issue:`41485`) +- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_csv` (:issue:`41485`) +- Deprecated passing arguments as positional in :meth:`DataFrame.drop` (other than ``"labels"``) and :meth:`Series.drop` (:issue:`41485`) +- Deprecated passing arguments as positional (other than ``filepath_or_buffer``) in :func:`read_table` (:issue:`41485`) + .. _whatsnew_130.deprecations.nuisance_columns: @@ -720,6 +750,44 @@ For example: A 24 dtype: int64 + +Similarly, when applying a function to :class:`DataFrameGroupBy`, columns on which +the function raises ``TypeError`` are currently silently ignored and dropped +from the result. + +This behavior is deprecated. In a future version, the ``TypeError`` +will be raised, and users will need to select only valid columns before calling +the function. + +For example: + +.. ipython:: python + + df = pd.DataFrame({"A": [1, 2, 3, 4], "B": pd.date_range("2016-01-01", periods=4)}) + gb = df.groupby([1, 1, 2, 2]) + +*Old behavior*: + +.. code-block:: ipython + + In [4]: gb.prod(numeric_only=False) + Out[4]: + A + 1 2 + 2 12 + +*Future behavior*: + +.. code-block:: ipython + + In [5]: gb.prod(numeric_only=False) + ... + TypeError: datetime64 type does not support prod operations + + In [6]: gb[["A"]].prod(numeric_only=False) + Out[6]: + A + 1 2 + 2 12 + ..
--------------------------------------------------------------------------- @@ -828,6 +896,7 @@ Strings - Bug in the conversion from ``pyarrow.ChunkedArray`` to :class:`~arrays.StringArray` when the original had zero chunks (:issue:`41040`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` ignoring replacements with ``regex=True`` for ``StringDType`` data (:issue:`41333`, :issue:`35977`) - Bug in :meth:`Series.str.extract` with :class:`~arrays.StringArray` returning object dtype for empty :class:`DataFrame` (:issue:`41441`) +- Bug in :meth:`Series.str.replace` where the ``case`` argument was ignored when ``regex=False`` (:issue:`41602`) Interval ^^^^^^^^ @@ -839,8 +908,9 @@ Interval Indexing ^^^^^^^^ -- Bug in :meth:`Index.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) +- Bug in :meth:`Index.union` and :meth:`MultiIndex.union` dropping duplicate ``Index`` values when ``Index`` was not monotonic or ``sort`` was set to ``False`` (:issue:`36289`, :issue:`31326`, :issue:`40862`) - Bug in :meth:`CategoricalIndex.get_indexer` failing to raise ``InvalidIndexError`` when non-unique (:issue:`38372`) +- Bug in :meth:`Series.loc` raising ``ValueError`` when input was filtered with a boolean list and values to set were a list with lower dimension (:issue:`20438`) - Bug in inserting many new columns into a :class:`DataFrame` causing incorrect subsequent indexing behavior (:issue:`38380`) - Bug in :meth:`DataFrame.__setitem__` raising ``ValueError`` when setting multiple values to duplicate columns (:issue:`15695`) - Bug in :meth:`DataFrame.loc`, :meth:`Series.loc`, :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` returning incorrect elements for non-monotonic :class:`DatetimeIndex` for string slices (:issue:`33146`) @@ -855,7 +925,7 @@ Indexing - Bug in :meth:`Series.__setitem__` raising ``ValueError`` when setting a :class:`Series` with a scalar indexer (:issue:`38303`) - Bug in :meth:`DataFrame.loc` dropping levels of :class:`MultiIndex` when :class:`DataFrame` used as input has only one row (:issue:`10521`) - Bug in :meth:`DataFrame.__getitem__` and :meth:`Series.__getitem__` always raising ``KeyError`` when slicing with existing strings an :class:`Index` with milliseconds (:issue:`33589`) -- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, issue:`39619`) +- Bug in setting ``timedelta64`` or ``datetime64`` values into numeric :class:`Series` failing to cast to object dtype (:issue:`39086`, :issue:`39619`) - Bug in setting :class:`Interval` values into a :class:`Series` or :class:`DataFrame` with mismatched :class:`IntervalDtype` incorrectly casting the new values to the existing dtype (:issue:`39120`) - Bug in setting ``datetime64`` values into a :class:`Series` with integer-dtype incorrect casting the datetime64 values to integers (:issue:`39266`) - Bug in setting ``np.datetime64("NaT")`` into a :class:`Series` with :class:`Datetime64TZDtype` incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`) @@ -869,9 +939,12 @@ Indexing - Bug in :meth:`DataFrame.__setitem__` and :meth:`DataFrame.iloc.__setitem__` raising ``ValueError`` when trying to index with a row-slice and setting a list as values (:issue:`40440`) - Bug in :meth:`DataFrame.loc` not raising ``KeyError`` when key was not found in :class:`MultiIndex` when levels contain more values than used 
(:issue:`41170`) - Bug in :meth:`DataFrame.loc.__setitem__` when setting-with-expansion incorrectly raising when the index in the expanding axis contains duplicates (:issue:`40096`) +- Bug in :meth:`DataFrame.loc.__getitem__` with :class:`MultiIndex` casting to float when at least one column is from has float dtype and we retrieve a scalar (:issue:`41369`) - Bug in :meth:`DataFrame.loc` incorrectly matching non-boolean index elements (:issue:`20432`) - Bug in :meth:`Series.__delitem__` with ``ExtensionDtype`` incorrectly casting to ``ndarray`` (:issue:`40386`) +- Bug in :meth:`DataFrame.loc` returning :class:`MultiIndex` in wrong order if indexer has duplicates (:issue:`40978`) - Bug in :meth:`DataFrame.__setitem__` raising ``TypeError`` when using a str subclass as the column name with a :class:`DatetimeIndex` (:issue:`37366`) +- Bug in :meth:`PeriodIndex.get_loc` failing to raise ``KeyError`` when given a :class:`Period` with a mismatched ``freq`` (:issue:`41670`) Missing ^^^^^^^ @@ -890,6 +963,7 @@ MultiIndex - Bug in :meth:`MultiIndex.equals` incorrectly returning ``True`` when :class:`MultiIndex` containing ``NaN`` even when they are differently ordered (:issue:`38439`) - Bug in :meth:`MultiIndex.intersection` always returning empty when intersecting with :class:`CategoricalIndex` (:issue:`38653`) - Bug in :meth:`MultiIndex.reindex` raising ``ValueError`` with empty MultiIndex and indexing only a specific level (:issue:`41170`) +- Bug in :meth:`MultiIndex.reindex` raising ``TypeError`` when reindexing against a flat :class:`Index` (:issue:`41707`) I/O ^^^ @@ -922,10 +996,11 @@ I/O - Bug in :func:`read_orc` always raising ``AttributeError`` (:issue:`40918`) - Bug in :func:`read_csv` and :func:`read_table` silently ignoring ``prefix`` if ``names`` and ``prefix`` are defined, now raising ``ValueError`` (:issue:`39123`) - Bug in :func:`read_csv` and :func:`read_excel` not respecting dtype for duplicated column name when ``mangle_dupe_cols`` is set to ``True`` (:issue:`35211`) +- Bug in :func:`read_csv` silently ignoring ``sep`` if ``delimiter`` and ``sep`` are defined, now raising ``ValueError`` (:issue:`39823`) - Bug in :func:`read_csv` and :func:`read_table` misinterpreting arguments when ``sys.setprofile`` had been previously called (:issue:`41069`) - Bug in the conversion from pyarrow to pandas (e.g. 
for reading Parquet) with nullable dtypes and a pyarrow array whose data buffer size is not a multiple of dtype size (:issue:`40896`) - Bug in :func:`read_excel` would raise an error when pandas could not determine the file type, even when user specified the ``engine`` argument (:issue:`41225`) -- +- Bug in :func:`read_clipboard` copying from an excel file shifts values into the wrong column if there are null values in first column (:issue:`41108`) Period ^^^^^^ @@ -971,7 +1046,7 @@ Groupby/resample/rolling - Bug in :class:`core.window.ewm.ExponentialMovingWindowGroupby` where the times vector and values became out of sync for non-trivial groups (:issue:`40951`) - Bug in :meth:`Series.asfreq` and :meth:`DataFrame.asfreq` dropping rows when the index is not sorted (:issue:`39805`) - Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`) -- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (issue:`40014`) +- Bug in :meth:`SeriesGroupBy.aggregate` where using a user-defined function to aggregate a ``Series`` with an object-typed :class:`Index` causes an incorrect :class:`Index` shape (:issue:`40014`) - Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`) - Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`) - Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementations bounds (:issue:`40767`) @@ -980,11 +1055,15 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.rolling` returning mean zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) - Bug in :meth:`DataFrame.rolling` returning sum not zero for all ``NaN`` window with ``min_periods=0`` if calculation is not numerical stable (:issue:`41053`) - Bug in :meth:`SeriesGroupBy.agg` failing to retain ordered :class:`CategoricalDtype` on order-preserving aggregations (:issue:`41147`) -- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:41111`) +- Bug in :meth:`DataFrameGroupBy.min` and :meth:`DataFrameGroupBy.max` with multiple object-dtype columns and ``numeric_only=False`` incorrectly raising ``ValueError`` (:issue:`41111`) - Bug in :meth:`DataFrameGroupBy.rank` with the GroupBy object's ``axis=0`` and the ``rank`` method's keyword ``axis=1`` (:issue:`41320`) - Bug in :meth:`DataFrameGroupBy.__getitem__` with non-unique columns incorrectly returning a malformed :class:`SeriesGroupBy` instead of :class:`DataFrameGroupBy` (:issue:`41427`) - Bug in :meth:`DataFrameGroupBy.transform` with non-unique columns incorrectly raising ``AttributeError`` (:issue:`41427`) - Bug in :meth:`Resampler.apply` with non-unique columns incorrectly dropping duplicated columns (:issue:`41445`) +- Bug in :meth:`SeriesGroupBy` aggregations incorrectly returning empty :class:`Series` instead of raising ``TypeError`` on aggregations that are invalid for its dtype, e.g. 
``.prod`` with ``datetime64[ns]`` dtype (:issue:`41342`) +- Bug in :class:`DataFrameGroupBy` aggregations incorrectly failing to drop columns with invalid dtypes for that aggregation when there are no valid columns (:issue:`41291`) +- Bug in :meth:`DataFrame.rolling.__iter__` where ``on`` was not assigned to the index of the resulting objects (:issue:`40373`) +- Bug in :meth:`DataFrameGroupBy.transform` and :meth:`DataFrameGroupBy.agg` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`41647`) Reshaping ^^^^^^^^^ @@ -1000,6 +1079,7 @@ Reshaping - Bug in :meth:`DataFrame.sort_values` not reshaping index correctly after sorting on columns, when ``ignore_index=True`` (:issue:`39464`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``ExtensionDtype`` dtypes (:issue:`39454`) - Bug in :meth:`DataFrame.append` returning incorrect dtypes with combinations of ``datetime64`` and ``timedelta64`` dtypes (:issue:`39574`) +- Bug in :meth:`DataFrame.append` with a :class:`DataFrame` with a :class:`MultiIndex` and appending a :class:`Series` whose :class:`Index` is not a :class:`MultiIndex` (:issue:`41707`) - Bug in :meth:`DataFrame.pivot_table` returning a ``MultiIndex`` for a single value when operating on and empty ``DataFrame`` (:issue:`13483`) - Allow :class:`Index` to be passed to the :func:`numpy.all` function (:issue:`40180`) - Bug in :meth:`DataFrame.stack` not preserving ``CategoricalDtype`` in a ``MultiIndex`` (:issue:`36991`) @@ -1048,10 +1128,13 @@ Other - Bug in :func:`pandas.testing.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`) +- Let Pandas compile on z/OS when using `xlc `_ (:issue:`35826`) - Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`) - Bug in :meth:`DataFrame.agg()` not sorting the aggregated axis in the order of the provided aggragation functions when one or more aggregation function fails to produce results (:issue:`33634`) - Bug in :meth:`DataFrame.clip` not interpreting missing values as no threshold (:issue:`40420`) - Bug in :class:`Series` backed by :class:`DatetimeArray` or :class:`TimedeltaArray` sometimes failing to set the array's ``freq`` to ``None`` (:issue:`41425`) +- Bug in creating a :class:`Series` from a ``range`` object that does not fit in the bounds of ``int64`` dtype (:issue:`30173`) +- Bug in creating a :class:`Series` from a ``dict`` with all-tuple keys and an :class:`Index` that requires reindexing (:issue:`41707`) .. --------------------------------------------------------------------------- diff --git a/doc/sphinxext/announce.py b/doc/sphinxext/announce.py index 2ec0b515ea95c..b0b430ed6a866 100755 --- a/doc/sphinxext/announce.py +++ b/doc/sphinxext/announce.py @@ -54,7 +54,7 @@ def get_authors(revision_range): pat = "^.*\\t(.*)$" - lst_release, cur_release = [r.strip() for r in revision_range.split("..")] + lst_release, cur_release = (r.strip() for r in revision_range.split("..")) if "|" in cur_release: # e.g. 
v1.0.1|HEAD @@ -119,7 +119,7 @@ def get_pull_requests(repo, revision_range): def build_components(revision_range, heading="Contributors"): - lst_release, cur_release = [r.strip() for r in revision_range.split("..")] + lst_release, cur_release = (r.strip() for r in revision_range.split("..")) authors = get_authors(revision_range) return { diff --git a/pandas/_config/config.py b/pandas/_config/config.py index 455f800073c15..37f5a5730439d 100644 --- a/pandas/_config/config.py +++ b/pandas/_config/config.py @@ -157,7 +157,7 @@ def _describe_option(pat: str = "", _print_desc: bool = True): if len(keys) == 0: raise OptionError("No such keys(s)") - s = "\n".join([_build_option_description(k) for k in keys]) + s = "\n".join(_build_option_description(k) for k in keys) if _print_desc: print(s) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7a286188c4e74..b72b927b3c2a8 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -516,7 +516,7 @@ def group_add(add_t[:, ::1] out, val = values[i, j] # not nan - if val == val: + if not checknull(val): nobs[lab, j] += 1 if nobs[lab, j] == 1: diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 5e1cc612bed57..f91b96dc1b1dc 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -11,7 +11,10 @@ from typing import ( import numpy as np -from pandas._typing import ArrayLike +from pandas._typing import ( + ArrayLike, + DtypeObj, +) # placeholder until we can specify np.ndarray[object, ndim=2] ndarray_obj_2d = np.ndarray @@ -52,8 +55,6 @@ def is_float_array(values: np.ndarray, skipna: bool = False): ... def is_integer_array(values: np.ndarray, skipna: bool = False): ... def is_bool_array(values: np.ndarray, skipna: bool = False): ... -def fast_multiget(mapping: dict, keys: np.ndarray, default=np.nan) -> np.ndarray: ... - def fast_unique_multiple_list_gen(gen: Generator, sort: bool = True) -> list: ... def fast_unique_multiple_list(lists: list, sort: bool = True) -> list: ... def fast_unique_multiple(arrays: list, sort: bool = True) -> list: ... @@ -73,6 +74,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[False] = ..., convert_to_nullable_integer: Literal[False] = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> np.ndarray: ... @overload @@ -85,6 +87,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: Literal[True] = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -97,6 +100,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -109,6 +113,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: Literal[True] = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -121,6 +126,7 @@ def maybe_convert_objects( convert_timedelta: bool = ..., convert_period: bool = ..., convert_to_nullable_integer: bool = ..., + dtype_if_all_nat: DtypeObj | None = ..., ) -> ArrayLike: ... @overload @@ -153,7 +159,7 @@ def ensure_string_array( def infer_datetimelike_array( arr: np.ndarray # np.ndarray[object] -) -> str: ... +) -> tuple[str, bool]: ... def astype_intsafe( arr: np.ndarray, # np.ndarray[object] @@ -185,7 +191,7 @@ def maybe_indices_to_slice( ) -> slice | np.ndarray: ... 
# np.ndarray[np.uint8] def clean_index_list(obj: list) -> tuple[ - list | np.ndarray, # np.ndarray[object] | np.ndarray[np.int64] + list | np.ndarray, # np.ndarray[object | np.int64 | np.uint64] bool, ]: ... diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index cbef4ed44dc06..4b5ef3e909a00 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -84,6 +84,10 @@ from pandas._libs.util cimport ( ) from pandas._libs.tslib import array_to_datetime +from pandas._libs.tslibs import ( + OutOfBoundsDatetime, + OutOfBoundsTimedelta, +) from pandas._libs.tslibs.period import Period from pandas._libs.missing cimport ( @@ -291,7 +295,7 @@ def item_from_zerodim(val: object) -> object: @cython.wraparound(False) @cython.boundscheck(False) -def fast_unique_multiple(list arrays, sort: bool = True) -> list: +def fast_unique_multiple(list arrays, sort: bool = True): """ Generate a list of unique values from a list of arrays. @@ -747,10 +751,14 @@ def clean_index_list(obj: list): object val bint all_arrays = True + # First check if we have a list of arraylikes, in which case we will + # pass them to MultiIndex.from_arrays for i in range(n): val = obj[i] if not (isinstance(val, list) or util.is_array(val) or hasattr(val, '_data')): + # TODO: EA? + # exclude tuples, frozensets as they may be contained in an Index all_arrays = False break @@ -762,11 +770,21 @@ def clean_index_list(obj: list): if inferred in ['string', 'bytes', 'mixed', 'mixed-integer']: return np.asarray(obj, dtype=object), 0 elif inferred in ['integer']: - # TODO: we infer an integer but it *could* be a uint64 - try: - return np.asarray(obj, dtype='int64'), 0 - except OverflowError: - return np.asarray(obj, dtype='object'), 0 + # we infer an integer but it *could* be a uint64 + + arr = np.asarray(obj) + if arr.dtype.kind not in ["i", "u"]: + # eg [0, uint64max] gets cast to float64, + # but then we know we have either uint64 or object + if (arr < 0).any(): + # TODO: similar to maybe_cast_to_integer_array + return np.asarray(obj, dtype="object"), 0 + + # GH#35481 + guess = np.asarray(obj, dtype="uint64") + return guess, 0 + + return arr, 0 return np.asarray(obj), 0 @@ -1187,6 +1205,7 @@ cdef class Seen: bint timedelta_ # seen_timedelta bint datetimetz_ # seen_datetimetz bint period_ # seen_period + bint interval_ # seen_interval def __cinit__(self, bint coerce_numeric=False): """ @@ -1212,6 +1231,7 @@ cdef class Seen: self.timedelta_ = False self.datetimetz_ = False self.period_ = False + self.interval_ = False self.coerce_numeric = coerce_numeric cdef inline bint check_uint64_conflict(self) except -1: @@ -1461,7 +1481,7 @@ def infer_dtype(value: object, skipna: bool = True) -> str: for i in range(n): val = values[i] - # do not use is_nul_datetimelike to keep + # do not use is_null_datetimelike to keep # np.datetime64('nat') and np.timedelta64('nat') if val is None or util.is_nan(val): pass @@ -1550,15 +1570,13 @@ def infer_dtype(value: object, skipna: bool = True) -> str: for i in range(n): val = values[i] - if (util.is_integer_object(val) and - not util.is_timedelta64_object(val) and - not util.is_datetime64_object(val)): + if util.is_integer_object(val): return "mixed-integer" return "mixed" -def infer_datetimelike_array(arr: ndarray[object]) -> str: +def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]: """ Infer if we have a datetime or timedelta array. 
- date: we have *only* date and maybe strings, nulls @@ -1576,12 +1594,13 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: Returns ------- str: {datetime, timedelta, date, nat, mixed} + bool """ cdef: Py_ssize_t i, n = len(arr) bint seen_timedelta = False, seen_date = False, seen_datetime = False bint seen_tz_aware = False, seen_tz_naive = False - bint seen_nat = False + bint seen_nat = False, seen_str = False list objs = [] object v @@ -1589,6 +1608,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: v = arr[i] if isinstance(v, str): objs.append(v) + seen_str = True if len(objs) == 3: break @@ -1609,7 +1629,7 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: seen_tz_aware = True if seen_tz_naive and seen_tz_aware: - return "mixed" + return "mixed", seen_str elif util.is_datetime64_object(v): # np.datetime64 seen_datetime = True @@ -1619,16 +1639,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: # timedelta, or timedelta64 seen_timedelta = True else: - return "mixed" + return "mixed", seen_str if seen_date and not (seen_datetime or seen_timedelta): - return "date" + return "date", seen_str elif seen_datetime and not seen_timedelta: - return "datetime" + return "datetime", seen_str elif seen_timedelta and not seen_datetime: - return "timedelta" + return "timedelta", seen_str elif seen_nat: - return "nat" + return "nat", seen_str # short-circuit by trying to # actually convert these strings @@ -1636,15 +1656,16 @@ def infer_datetimelike_array(arr: ndarray[object]) -> str: # convert *every* string array if len(objs): try: - array_to_datetime(objs, errors="raise") - return "datetime" + # require_iso8601 as in maybe_infer_to_datetimelike + array_to_datetime(objs, errors="raise", require_iso8601=True) + return "datetime", seen_str except (ValueError, TypeError): pass # we are *not* going to infer from strings # for timedelta as too much ambiguity - return 'mixed' + return "mixed", seen_str cdef inline bint is_timedelta(object o): @@ -2029,16 +2050,58 @@ cdef bint is_period_array(ndarray[object] values): return True -cdef class IntervalValidator(Validator): - cdef inline bint is_value_typed(self, object value) except -1: - return is_interval(value) - - cpdef bint is_interval_array(ndarray values): + """ + Is this an ndarray of Interval (or np.nan) with a single dtype? + """ cdef: - IntervalValidator validator = IntervalValidator(len(values), - skipna=True) - return validator.validate(values) + Py_ssize_t i, n = len(values) + str closed = None + bint numeric = False + bint dt64 = False + bint td64 = False + object val + + if len(values) == 0: + return False + + for val in values: + if is_interval(val): + if closed is None: + closed = val.closed + numeric = ( + util.is_float_object(val.left) + or util.is_integer_object(val.left) + ) + td64 = is_timedelta(val.left) + dt64 = PyDateTime_Check(val.left) + elif val.closed != closed: + # mismatched closedness + return False + elif numeric: + if not ( + util.is_float_object(val.left) + or util.is_integer_object(val.left) + ): + # i.e. 
datetime64 or timedelta64 + return False + elif td64: + if not is_timedelta(val.left): + return False + elif dt64: + if not PyDateTime_Check(val.left): + return False + else: + raise ValueError(val) + elif util.is_nan(val) or val is None: + pass + else: + return False + + if closed is None: + # we saw all-NAs, no actual Intervals + return False + return True @cython.boundscheck(False) @@ -2275,7 +2338,9 @@ def maybe_convert_objects(ndarray[object] objects, bint convert_datetime=False, bint convert_timedelta=False, bint convert_period=False, - bint convert_to_nullable_integer=False) -> "ArrayLike": + bint convert_interval=False, + bint convert_to_nullable_integer=False, + object dtype_if_all_nat=None) -> "ArrayLike": """ Type inference function-- convert object array to proper dtype @@ -2298,9 +2363,14 @@ def maybe_convert_objects(ndarray[object] objects, convert_period : bool, default False If an array-like object contains only (homogeneous-freq) Period values or NaT, whether to convert and return a PeriodArray. + convert_interval : bool, default False + If an array-like object contains only Interval objects (with matching + dtypes and closedness) or NaN, whether to convert to IntervalArray. convert_to_nullable_integer : bool, default False If an array-like object contains only integer values (and NaN) is encountered, whether to convert and return an IntegerArray. + dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None + Dtype to cast to if we have all-NaT. Returns ------- @@ -2369,8 +2439,12 @@ def maybe_convert_objects(ndarray[object] objects, seen.float_ = True elif is_timedelta(val): if convert_timedelta: - itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") seen.timedelta_ = True + try: + itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8") + except OutOfBoundsTimedelta: + seen.object_ = True + break else: seen.object_ = True break @@ -2407,8 +2481,12 @@ def maybe_convert_objects(ndarray[object] objects, break else: seen.datetime_ = True - idatetimes[i] = convert_to_tsobject( - val, None, None, 0, 0).value + try: + idatetimes[i] = convert_to_tsobject( + val, None, None, 0, 0).value + except OutOfBoundsDatetime: + seen.object_ = True + break else: seen.object_ = True break @@ -2428,6 +2506,13 @@ def maybe_convert_objects(ndarray[object] objects, except (ValueError, TypeError): seen.object_ = True break + elif is_interval(val): + if convert_interval: + seen.interval_ = True + break + else: + seen.object_ = True + break else: seen.object_ = True break @@ -2449,6 +2534,17 @@ def maybe_convert_objects(ndarray[object] objects, # unbox to PeriodArray return pi._data + seen.object_ = True + + if seen.interval_: + if is_interval_array(objects): + from pandas import IntervalIndex + ii = IntervalIndex(objects) + + # unbox to IntervalArray + return ii._data + + seen.object_ = True if not seen.object_: result = None @@ -2478,8 +2574,13 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: - # TODO: array full of NaT ambiguity resolve here needed - pass + dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, datetimes, timedeltas + ) + elif convert_datetime: result = datetimes elif convert_timedelta: @@ -2518,8 +2619,13 @@ def maybe_convert_objects(ndarray[object] objects, elif seen.nat_: if not seen.numeric_: if convert_datetime and convert_timedelta: - # TODO: array full of NaT ambiguity resolve here needed - pass + 
dtype = dtype_if_all_nat + if dtype is not None: + # otherwise we keep object dtype + result = _infer_all_nats( + dtype, datetimes, timedeltas + ) + elif convert_datetime: result = datetimes elif convert_timedelta: @@ -2550,6 +2656,26 @@ def maybe_convert_objects(ndarray[object] objects, return objects +cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas): + """ + If we have all-NaT values, cast these to the given dtype. + """ + if isinstance(dtype, np.dtype): + if dtype == "M8[ns]": + result = datetimes + elif dtype == "m8[ns]": + result = timedeltas + else: + raise ValueError(dtype) + else: + # ExtensionDtype + cls = dtype.construct_array_type() + i8vals = np.empty(len(datetimes), dtype="i8") + i8vals.fill(NPY_NAT) + result = cls(i8vals, dtype=dtype) + return result + + class NoDefault(Enum): # We make this an Enum # 1) because it round-trips through pickle correctly (see GH#40397) @@ -2773,25 +2899,3 @@ def to_object_array_tuples(rows: object) -> np.ndarray: result[i, j] = row[j] return result - - -@cython.wraparound(False) -@cython.boundscheck(False) -def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray: - cdef: - Py_ssize_t i, n = len(keys) - object val - ndarray[object] output = np.empty(n, dtype='O') - - if n == 0: - # kludge, for Series - return np.empty(0, dtype='f8') - - for i in range(n): - val = keys[i] - if val in mapping: - output[i] = mapping[val] - else: - output[i] = default - - return maybe_convert_objects(output) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index b2d548e04eab4..7d7074988e5f0 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -146,6 +146,11 @@ cdef extern from "parser/tokenizer.h": enum: ERROR_OVERFLOW + ctypedef enum BadLineHandleMethod: + ERROR, + WARN, + SKIP + ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors) ctypedef int (*io_cleanup)(void *src) @@ -198,8 +203,7 @@ cdef extern from "parser/tokenizer.h": int usecols int expected_fields - int error_bad_lines - int warn_bad_lines + BadLineHandleMethod on_bad_lines # floating point options char decimal @@ -351,8 +355,7 @@ cdef class TextReader: thousands=None, # bytes | str dtype=None, usecols=None, - bint error_bad_lines=True, - bint warn_bad_lines=True, + on_bad_lines = ERROR, bint na_filter=True, na_values=None, na_fvalues=None, @@ -435,9 +438,7 @@ cdef class TextReader: raise ValueError('Only length-1 comment characters supported') self.parser.commentchar = ord(comment) - # error handling of bad lines - self.parser.error_bad_lines = int(error_bad_lines) - self.parser.warn_bad_lines = int(warn_bad_lines) + self.parser.on_bad_lines = on_bad_lines self.skiprows = skiprows if skiprows is not None: @@ -454,8 +455,7 @@ cdef class TextReader: # XXX if skipfooter > 0: - self.parser.error_bad_lines = 0 - self.parser.warn_bad_lines = 0 + self.parser.on_bad_lines = SKIP self.delimiter = delimiter @@ -570,9 +570,6 @@ cdef class TextReader: kh_destroy_str_starts(self.false_set) self.false_set = NULL - def set_error_bad_lines(self, int status) -> None: - self.parser.error_bad_lines = status - def _set_quoting(self, quote_char: str | bytes | None, quoting: int): if not isinstance(quoting, int): raise TypeError('"quoting" must be an integer') diff --git a/pandas/_libs/src/headers/cmath b/pandas/_libs/src/headers/cmath index 632e1fc2390d0..9e7540cfefc13 100644 --- a/pandas/_libs/src/headers/cmath +++ b/pandas/_libs/src/headers/cmath @@ -25,6 +25,18 @@ namespace std { 
__inline int isnan(double x) { return _isnan(x); } __inline int notnan(double x) { return x == x; } } +#elif defined(__MVS__) +#include + +#define _signbit signbit +#undef signbit +#undef isnan + +namespace std { + __inline int notnan(double x) { return x == x; } + __inline int signbit(double num) { return _signbit(num); } + __inline int isnan(double x) { return isnan(x); } +} #else #include diff --git a/pandas/_libs/src/parser/tokenizer.c b/pandas/_libs/src/parser/tokenizer.c index 49eb1e7855098..49797eea59ddc 100644 --- a/pandas/_libs/src/parser/tokenizer.c +++ b/pandas/_libs/src/parser/tokenizer.c @@ -93,8 +93,7 @@ void parser_set_default_options(parser_t *self) { self->allow_embedded_newline = 1; self->expected_fields = -1; - self->error_bad_lines = 0; - self->warn_bad_lines = 0; + self->on_bad_lines = ERROR; self->commentchar = '#'; self->thousands = '\0'; @@ -457,7 +456,7 @@ static int end_line(parser_t *self) { self->line_fields[self->lines] = 0; // file_lines is now the actual file line number (starting at 1) - if (self->error_bad_lines) { + if (self->on_bad_lines == ERROR) { self->error_msg = malloc(bufsize); snprintf(self->error_msg, bufsize, "Expected %d fields in line %" PRIu64 ", saw %" PRId64 "\n", @@ -468,7 +467,7 @@ static int end_line(parser_t *self) { return -1; } else { // simply skip bad lines - if (self->warn_bad_lines) { + if (self->on_bad_lines == WARN) { // pass up error message msg = malloc(bufsize); snprintf(msg, bufsize, diff --git a/pandas/_libs/src/parser/tokenizer.h b/pandas/_libs/src/parser/tokenizer.h index f69fee4993d34..623d3690f252a 100644 --- a/pandas/_libs/src/parser/tokenizer.h +++ b/pandas/_libs/src/parser/tokenizer.h @@ -84,6 +84,12 @@ typedef enum { QUOTE_NONE } QuoteStyle; +typedef enum { + ERROR, + WARN, + SKIP +} BadLineHandleMethod; + typedef void *(*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status, const char *encoding_errors); typedef int (*io_cleanup)(void *src); @@ -136,8 +142,7 @@ typedef struct parser_t { int usecols; // Boolean: 1: usecols provided, 0: none provided int expected_fields; - int error_bad_lines; - int warn_bad_lines; + BadLineHandleMethod on_bad_lines; // floating point options char decimal; diff --git a/pandas/_libs/tslibs/nattype.pyi b/pandas/_libs/tslibs/nattype.pyi index 0f81dcb4b2df1..5a2985d0e815b 100644 --- a/pandas/_libs/tslibs/nattype.pyi +++ b/pandas/_libs/tslibs/nattype.pyi @@ -1,8 +1,14 @@ -from datetime import datetime +from datetime import ( + datetime, + timedelta, +) +from typing import Any import numpy as np +from pandas._libs.tslibs.period import Period + NaT: NaTType iNaT: int nat_strings: set[str] @@ -133,3 +139,31 @@ class NaTType(datetime): # inject Period properties @property def qyear(self) -> float: ... + + def __eq__(self, other: Any) -> bool: ... + def __ne__(self, other: Any) -> bool: ... + # https://github.com/python/mypy/issues/9015 + # error: Argument 1 of "__lt__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __lt__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__le__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __le__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... 
+ # error: Argument 1 of "__gt__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __gt__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... + # error: Argument 1 of "__ge__" is incompatible with supertype "date"; + # supertype defines the argument type as "date" + def __ge__( # type: ignore[override] + self, + other: datetime | timedelta | Period | np.datetime64 | np.timedelta64 + ) -> bool: ... diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi index 04a1b391dc30a..a631191f8b005 100644 --- a/pandas/_libs/tslibs/timezones.pyi +++ b/pandas/_libs/tslibs/timezones.pyi @@ -2,31 +2,22 @@ from datetime import ( datetime, tzinfo, ) -from typing import ( - Callable, - Optional, - Union, -) +from typing import Callable import numpy as np # imported from dateutil.tz dateutil_gettz: Callable[[str], tzinfo] - def tz_standardize(tz: tzinfo) -> tzinfo: ... - -def tz_compare(start: Optional[tzinfo], end: Optional[tzinfo]) -> bool: ... - +def tz_compare(start: tzinfo | None, end: tzinfo | None) -> bool: ... def infer_tzinfo( - start: Optional[datetime], end: Optional[datetime], -) -> Optional[tzinfo]: ... + start: datetime | None, + end: datetime | None, +) -> tzinfo | None: ... # ndarrays returned are both int64_t def get_dst_info(tz: tzinfo) -> tuple[np.ndarray, np.ndarray, str]: ... - -def maybe_get_tz(tz: Optional[Union[str, int, np.int64, tzinfo]]) -> Optional[tzinfo]: ... - -def get_timezone(tz: tzinfo) -> Union[tzinfo, str]: ... - -def is_utc(tz: Optional[tzinfo]) -> bool: ... +def maybe_get_tz(tz: str | int | np.int64 | tzinfo | None) -> tzinfo | None: ... +def get_timezone(tz: tzinfo) -> tzinfo | str: ... +def is_utc(tz: tzinfo | None) -> bool: ... diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index f47885a2e3306..1cbe55320099b 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -2,11 +2,7 @@ from datetime import ( timedelta, tzinfo, ) -from typing import ( - Iterable, - Optional, - Union, -) +from typing import Iterable import numpy as np @@ -14,12 +10,10 @@ def tz_convert_from_utc( vals: np.ndarray, # const int64_t[:] tz: tzinfo, ) -> np.ndarray: ... # np.ndarray[np.int64] - def tz_convert_from_utc_single(val: np.int64, tz: tzinfo) -> np.int64: ... - def tz_localize_to_utc( vals: np.ndarray, # np.ndarray[np.int64] - tz: Optional[tzinfo], - ambiguous: Optional[Union[str, bool, Iterable[bool]]] = None, - nonexistent: Optional[Union[str, timedelta, np.timedelta64]] = None, + tz: tzinfo | None, + ambiguous: str | bool | Iterable[bool] | None = None, + nonexistent: str | timedelta | np.timedelta64 | None = None, ) -> np.ndarray: ... # np.ndarray[np.int64] diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index 6ed1e10ef2353..2a23289cdf61b 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -3,10 +3,6 @@ For cython types that cannot be represented precisely, closest-available python equivalents are used, and the precise types kept as adjacent comments. """ from datetime import tzinfo -from typing import ( - Optional, - Union, -) import numpy as np @@ -16,32 +12,24 @@ from pandas._libs.tslibs.offsets import BaseOffset def dt64arr_to_periodarr( stamps: np.ndarray, # const int64_t[:] freq: int, - tz: Optional[tzinfo], + tz: tzinfo | None, ) -> np.ndarray: ... 
# np.ndarray[np.int64, ndim=1] - - def is_date_array_normalized( stamps: np.ndarray, # const int64_t[:] - tz: Optional[tzinfo] = None, + tz: tzinfo | None = None, ) -> bool: ... - - def normalize_i8_timestamps( stamps: np.ndarray, # const int64_t[:] - tz: Optional[tzinfo], + tz: tzinfo | None, ) -> np.ndarray: ... # np.ndarray[np.int64] - - def get_resolution( stamps: np.ndarray, # const int64_t[:] - tz: Optional[tzinfo] = None, + tz: tzinfo | None = None, ) -> Resolution: ... - - def ints_to_pydatetime( arr: np.ndarray, # const int64_t[:}] - tz: Optional[tzinfo] = None, - freq: Optional[Union[str, BaseOffset]] = None, + tz: tzinfo | None = None, + freq: str | BaseOffset | None = None, fold: bool = False, box: str = "datetime", ) -> np.ndarray: ... # np.ndarray[object] diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 8d64bf8852946..369832e9bc05c 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -92,6 +92,18 @@ def is_platform_mac() -> bool: return sys.platform == "darwin" +def is_platform_arm() -> bool: + """ + Checking if he running platform use ARM architecture. + + Returns + ------- + bool + True if the running platform uses ARM architecture. + """ + return platform.machine() in ("arm64", "aarch64") + + def import_lzma(): """ Importing the `lzma` module. diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index 2c184c38e6b1a..941c59592dbbd 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -22,11 +22,11 @@ "openpyxl": "3.0.0", "pandas_gbq": "0.12.0", "pyarrow": "0.17.0", - "pytest": "5.0.1", + "pytest": "6.0", "pyxlsb": "1.0.6", "s3fs": "0.4.0", "scipy": "1.2.0", - "sqlalchemy": "1.2.8", + "sqlalchemy": "1.3.0", "tables": "3.5.1", "tabulate": "0.8.7", "xarray": "0.12.3", diff --git a/pandas/compat/numpy/function.py b/pandas/compat/numpy/function.py index 63ea5554e32d7..69dc3ac417510 100644 --- a/pandas/compat/numpy/function.py +++ b/pandas/compat/numpy/function.py @@ -22,10 +22,7 @@ Union, ) -from numpy import ( - __version__, - ndarray, -) +from numpy import ndarray from pandas._libs.lib import ( is_bool, @@ -38,8 +35,6 @@ validate_kwargs, ) -from pandas.util.version import Version - class CompatValidator: def __init__( @@ -128,10 +123,7 @@ def validate_argmax_with_skipna(skipna, args, kwargs): ARGSORT_DEFAULTS["axis"] = -1 ARGSORT_DEFAULTS["kind"] = "quicksort" ARGSORT_DEFAULTS["order"] = None - -if Version(__version__) >= Version("1.17.0"): - # GH-26361. NumPy added radix sort and changed default to None. 
- ARGSORT_DEFAULTS["kind"] = None +ARGSORT_DEFAULTS["kind"] = None validate_argsort = CompatValidator( diff --git a/pandas/conftest.py b/pandas/conftest.py index f948dc11bc014..329023ed7ba6a 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -66,6 +66,11 @@ MultiIndex, ) +# Until https://github.com/numpy/numpy/issues/19078 is sorted out, just suppress +suppress_npdev_promotion_warning = pytest.mark.filterwarnings( + "ignore:Promotion of numbers and bools:FutureWarning" +) + # ---------------------------------------------------------------- # Configuration / Settings # ---------------------------------------------------------------- @@ -112,6 +117,8 @@ def pytest_collection_modifyitems(items): if "/frame/" in item.nodeid: item.add_marker(pytest.mark.arraymanager) + item.add_marker(suppress_npdev_promotion_warning) + # Hypothesis hypothesis.settings.register_profile( diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f8f5e5e05bc35..30f42435ad177 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1266,14 +1266,14 @@ def compute(self, method: str) -> Series: return dropped.sort_values(ascending=ascending).head(n) # fast method - arr, pandas_dtype = _ensure_data(dropped.values) + arr, new_dtype = _ensure_data(dropped.values) if method == "nlargest": arr = -arr - if is_integer_dtype(pandas_dtype): + if is_integer_dtype(new_dtype): # GH 21426: ensure reverse ordering at boundaries arr -= 1 - elif is_bool_dtype(pandas_dtype): + elif is_bool_dtype(new_dtype): # GH 26154: ensure False is smaller than True arr = 1 - (-arr) diff --git a/pandas/core/array_algos/quantile.py b/pandas/core/array_algos/quantile.py index efa36a5bd3ae9..32c50ed38eba0 100644 --- a/pandas/core/array_algos/quantile.py +++ b/pandas/core/array_algos/quantile.py @@ -37,7 +37,18 @@ def quantile_compat(values: ArrayLike, qs: np.ndarray, interpolation: str) -> Ar mask = isna(values) return _quantile_with_mask(values, mask, fill_value, qs, interpolation) else: - return _quantile_ea_compat(values, qs, interpolation) + # In general we don't want to import from arrays here; + # this is temporary pending discussion in GH#41428 + from pandas.core.arrays import BaseMaskedArray + + if isinstance(values, BaseMaskedArray): + # e.g. IntegerArray, does not implement _from_factorized + out = _quantile_ea_fallback(values, qs, interpolation) + + else: + out = _quantile_ea_compat(values, qs, interpolation) + + return out def _quantile_with_mask( @@ -144,3 +155,31 @@ def _quantile_ea_compat( # error: Incompatible return value type (got "ndarray", expected "ExtensionArray") return result # type: ignore[return-value] + + +def _quantile_ea_fallback( + values: ExtensionArray, qs: np.ndarray, interpolation: str +) -> ExtensionArray: + """ + quantile compatibility for ExtensionArray subclasses that do not + implement `_from_factorized`, e.g. IntegerArray. + + Notes + ----- + We assume that all impacted cases are 1D-only. 
+ """ + mask = np.atleast_2d(np.asarray(values.isna())) + npvalues = np.atleast_2d(np.asarray(values)) + + res = _quantile_with_mask( + npvalues, + mask=mask, + fill_value=values.dtype.na_value, + qs=qs, + interpolation=interpolation, + ) + assert res.ndim == 2 + assert res.shape[0] == 1 + res = res[0] + out = type(values)._from_sequence(res, dtype=values.dtype) + return out diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 7c76a04a605e3..0e8097cf1fc78 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -94,7 +94,7 @@ def take( axis: int = 0, ) -> NDArrayBackedExtensionArrayT: if allow_fill: - fill_value = self._validate_fill_value(fill_value) + fill_value = self._validate_scalar(fill_value) new_data = take( self._ndarray, @@ -107,25 +107,6 @@ def take( ) return self._from_backing_data(new_data) - def _validate_fill_value(self, fill_value): - """ - If a fill_value is passed to `take` convert it to a representation - suitable for self._ndarray, raising TypeError if this is not possible. - - Parameters - ---------- - fill_value : object - - Returns - ------- - fill_value : native representation - - Raises - ------ - TypeError - """ - raise AbstractMethodError(self) - # ------------------------------------------------------------------------ def equals(self, other) -> bool: @@ -194,7 +175,7 @@ def shift(self, periods=1, fill_value=None, axis=0): def _validate_shift_value(self, fill_value): # TODO: after deprecation in datetimelikearraymixin is enforced, # we can remove this and ust validate_fill_value directly - return self._validate_fill_value(fill_value) + return self._validate_scalar(fill_value) def __setitem__(self, key, value): key = check_array_indexer(self, key) @@ -346,6 +327,36 @@ def where( res_values = np.where(mask, self._ndarray, value) return self._from_backing_data(res_values) + # ------------------------------------------------------------------------ + # Index compat methods + + def insert( + self: NDArrayBackedExtensionArrayT, loc: int, item + ) -> NDArrayBackedExtensionArrayT: + """ + Make new ExtensionArray inserting new item at location. Follows + Python list.append semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + type(self) + """ + code = self._validate_scalar(item) + + new_vals = np.concatenate( + ( + self._ndarray[:loc], + np.asarray([code], dtype=self._ndarray.dtype), + self._ndarray[loc:], + ) + ) + return self._from_backing_data(new_vals) + # ------------------------------------------------------------------------ # Additional array methods # These are not part of the EA API, but we implement them because diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index cb8a08f5668ac..068f5703649fa 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -439,12 +439,6 @@ def __init__( "explicitly specify the categories order " "by passing in a categories argument." 
) from err - except ValueError as err: - - # TODO(EA2D) - raise NotImplementedError( - "> 1 ndim Categorical are not supported at this time" - ) from err # we're inferring from values dtype = CategoricalDtype(categories, dtype.ordered) @@ -1413,7 +1407,7 @@ def _validate_searchsorted_value(self, value): codes = np.array(locs, dtype=self.codes.dtype) # type: ignore[assignment] return codes - def _validate_fill_value(self, fill_value): + def _validate_scalar(self, fill_value): """ Convert a user-facing fill_value to a representation to use with our underlying ndarray, raising TypeError if this is not possible. @@ -1442,8 +1436,6 @@ def _validate_fill_value(self, fill_value): ) return fill_value - _validate_scalar = _validate_fill_value - # ------------------------------------------------------------- def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: @@ -2453,7 +2445,9 @@ def replace(self, to_replace, value, inplace: bool = False): # ------------------------------------------------------------------------ # String methods interface - def _str_map(self, f, na_value=np.nan, dtype=np.dtype("object")): + def _str_map( + self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True + ): # Optimization to apply the callable `f` to the categories once # and rebuild the result by `take`ing from the result with the codes. # Returns the same type as the object-dtype implementation though. diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index ff46715d0a527..ba5be03b93490 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -557,27 +557,8 @@ def _validate_comparison_value(self, other): return other - def _validate_fill_value(self, fill_value): - """ - If a fill_value is passed to `take` convert it to an i8 representation, - raising TypeError if this is not possible. - - Parameters - ---------- - fill_value : object - - Returns - ------- - fill_value : np.int64, np.datetime64, or np.timedelta64 - - Raises - ------ - TypeError - """ - return self._validate_scalar(fill_value) - def _validate_shift_value(self, fill_value): - # TODO(2.0): once this deprecation is enforced, use _validate_fill_value + # TODO(2.0): once this deprecation is enforced, use _validate_scalar if is_valid_na_for_dtype(fill_value, self.dtype): fill_value = NaT elif isinstance(fill_value, self._recognized_scalars): diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index aee0d4fecd6ae..020f708606353 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -1119,14 +1119,14 @@ def to_period(self, freq=None) -> PeriodArray: ... "2000-08-31 00:00:00"])) >>> df.index.to_period("M") PeriodIndex(['2000-03', '2000-05', '2000-08'], - dtype='period[M]', freq='M') + dtype='period[M]') Infer the daily frequency >>> idx = pd.date_range("2017-01-01", periods=2) >>> idx.to_period() PeriodIndex(['2017-01-01', '2017-01-02'], - dtype='period[D]', freq='D') + dtype='period[D]') """ from pandas.core.arrays import PeriodArray @@ -2104,7 +2104,6 @@ def sequence_to_dt64ns( result = data.view(DT64NS_DTYPE) if copy: - # TODO: should this be deepcopy? 
result = result.copy() assert isinstance(result, np.ndarray), type(result) diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index a99bf245a6073..8836695efcbcb 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -32,7 +32,6 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender -from pandas.core.dtypes.cast import maybe_convert_platform from pandas.core.dtypes.common import ( is_categorical_dtype, is_datetime64_dtype, @@ -802,7 +801,7 @@ def fillna( if limit is not None: raise TypeError("limit is not supported for IntervalArray.") - value_left, value_right = self._validate_fill_value(value) + value_left, value_right = self._validate_scalar(value) left = self.left.fillna(value=value_left) right = self.right.fillna(value=value_right) @@ -1001,7 +1000,7 @@ def take( fill_left = fill_right = fill_value if allow_fill: - fill_left, fill_right = self._validate_fill_value(fill_value) + fill_left, fill_right = self._validate_scalar(fill_value) left_take = take( self._left, indices, allow_fill=allow_fill, fill_value=fill_left @@ -1038,6 +1037,7 @@ def _validate_scalar(self, value): if isinstance(value, Interval): self._check_closed_matches(value, name="value") left, right = value.left, value.right + # TODO: check subdtype match like _validate_setitem_value? elif is_valid_na_for_dtype(value, self.left.dtype): # GH#18295 left = right = value @@ -1047,9 +1047,6 @@ def _validate_scalar(self, value): ) return left, right - def _validate_fill_value(self, value): - return self._validate_scalar(value) - def _validate_setitem_value(self, value): needs_float_conversion = False @@ -1650,4 +1647,6 @@ def _maybe_convert_platform_interval(values) -> ArrayLike: else: values = extract_array(values, extract_numpy=True) - return maybe_convert_platform(values) + if not hasattr(values, "dtype"): + return np.asarray(values) + return values diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index e9d554200805e..dc592f205b3ea 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -190,7 +190,7 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs): def isna(self) -> np.ndarray: return isna(self._ndarray) - def _validate_fill_value(self, fill_value): + def _validate_scalar(self, fill_value): if fill_value is None: # Primarily for subclasses fill_value = self.dtype.na_value diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 101209be30b40..d8c1b9cef468a 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -564,15 +564,15 @@ def asfreq(self, freq=None, how: str = "E") -> PeriodArray: >>> pidx = pd.period_range('2010-01-01', '2015-01-01', freq='A') >>> pidx PeriodIndex(['2010', '2011', '2012', '2013', '2014', '2015'], - dtype='period[A-DEC]', freq='A-DEC') + dtype='period[A-DEC]') >>> pidx.asfreq('M') PeriodIndex(['2010-12', '2011-12', '2012-12', '2013-12', '2014-12', - '2015-12'], dtype='period[M]', freq='M') + '2015-12'], dtype='period[M]') >>> pidx.asfreq('M', how='S') PeriodIndex(['2010-01', '2011-01', '2012-01', '2013-01', '2014-01', - '2015-01'], dtype='period[M]', freq='M') + '2015-01'], dtype='period[M]') """ how = libperiod.validate_end_alias(how) @@ -866,7 +866,7 @@ def start_time(self) -> DatetimeArray: def end_time(self) -> DatetimeArray: return self.to_timestamp(how="end") - def _require_matching_freq(self, other, base=False): + def _require_matching_freq(self, other, base: bool = False) -> 
None: # See also arrays.period.raise_on_incompatible if isinstance(other, BaseOffset): other_freq = other @@ -1057,7 +1057,7 @@ def dt64arr_to_periodarr(data, freq, tz=None): Returns ------- - ordinals : ndarray[int] + ordinals : ndarray[int64] freq : Tick The frequency extracted from the Series or DatetimeIndex if that's used. diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 4847372f18239..6ab296b314615 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1397,7 +1397,7 @@ def max(self, axis=0, *args, **kwargs): # This condition returns a nan if there are no valid values in the array. if self.size > 0 and self._valid_sp_values.size == 0: - return np.nan + return self.fill_value else: return np.nanmax(self, axis) @@ -1406,7 +1406,7 @@ def min(self, axis=0, *args, **kwargs): # This condition returns a nan if there are no valid values in the array. if self.size > 0 and self._valid_sp_values.size == 0: - return np.nan + return self.fill_value else: return np.nanmin(self, axis) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 74ca5130ca322..ab1dadf4d2dfa 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -410,7 +410,9 @@ def _cmp_method(self, other, op): # String methods interface _str_na_value = StringDtype.na_value - def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): from pandas.arrays import BooleanArray if dtype is None: diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index d5ee28eb7017e..3cf471e381da9 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -1,5 +1,6 @@ from __future__ import annotations +from collections.abc import Callable # noqa: PDF001 import re from typing import ( TYPE_CHECKING, @@ -22,6 +23,7 @@ type_t, ) from pandas.compat import ( + pa_version_under1p0, pa_version_under2p0, pa_version_under3p0, pa_version_under4p0, @@ -29,14 +31,17 @@ from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.common import ( is_array_like, is_bool_dtype, + is_dtype_equal, is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, + pandas_dtype, ) from pandas.core.dtypes.dtypes import register_extension_dtype from pandas.core.dtypes.missing import isna @@ -46,39 +51,41 @@ from pandas.core.arrays.base import ExtensionArray from pandas.core.arrays.boolean import BooleanDtype from pandas.core.arrays.integer import Int64Dtype +from pandas.core.arrays.numeric import NumericDtype from pandas.core.arrays.string_ import StringDtype from pandas.core.indexers import ( check_array_indexer, validate_indices, ) from pandas.core.strings.object_array import ObjectStringArrayMixin -from pandas.util.version import Version -try: +# PyArrow backed StringArrays are available starting at 1.0.0, but this +# file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute +# and its compute functions existed. GH38801 +if not pa_version_under1p0: import pyarrow as pa -except ImportError: - pa = None -else: - # PyArrow backed StringArrays are available starting at 1.0.0, but this - # file is imported from even if pyarrow is < 1.0.0, before pyarrow.compute - # and its compute functions existed. 
GH38801 - if Version(pa.__version__) >= Version("1.0.0"): - import pyarrow.compute as pc - - ARROW_CMP_FUNCS = { - "eq": pc.equal, - "ne": pc.not_equal, - "lt": pc.less, - "gt": pc.greater, - "le": pc.less_equal, - "ge": pc.greater_equal, - } + import pyarrow.compute as pc + + ARROW_CMP_FUNCS = { + "eq": pc.equal, + "ne": pc.not_equal, + "lt": pc.less, + "gt": pc.greater, + "le": pc.less_equal, + "ge": pc.greater_equal, + } if TYPE_CHECKING: from pandas import Series +def _chk_pyarrow_available() -> None: + if pa_version_under1p0: + msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." + raise ImportError(msg) + + @register_extension_dtype class ArrowStringDtype(StringDtype): """ @@ -111,6 +118,9 @@ class ArrowStringDtype(StringDtype): #: StringDtype.na_value uses pandas.NA na_value = libmissing.NA + def __init__(self): + _chk_pyarrow_available() + @property def type(self) -> type[str]: return str @@ -212,10 +222,8 @@ class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin): Length: 4, dtype: arrow_string """ - _dtype = ArrowStringDtype() - def __init__(self, values): - self._chk_pyarrow_available() + self._dtype = ArrowStringDtype() if isinstance(values, pa.Array): self._data = pa.chunked_array([values]) elif isinstance(values, pa.ChunkedArray): @@ -228,19 +236,11 @@ def __init__(self, values): "ArrowStringArray requires a PyArrow (chunked) array of string type" ) - @classmethod - def _chk_pyarrow_available(cls) -> None: - # TODO: maybe update import_optional_dependency to allow a minimum - # version to be specified rather than use the global minimum - if pa is None or Version(pa.__version__) < Version("1.0.0"): - msg = "pyarrow>=1.0.0 is required for PyArrow backed StringArray." - raise ImportError(msg) - @classmethod def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False): from pandas.core.arrays.masked import BaseMaskedArray - cls._chk_pyarrow_available() + _chk_pyarrow_available() if isinstance(scalars, BaseMaskedArray): # avoid costly conversion to object dtype in ensure_string_array and @@ -289,10 +289,14 @@ def to_numpy( # type: ignore[override] """ # TODO: copy argument is ignored - if na_value is lib.no_default: - na_value = self._dtype.na_value - result = self._data.__array__(dtype=dtype) - result[isna(result)] = na_value + result = np.array(self._data, dtype=dtype) + if self._data.null_count > 0: + if na_value is lib.no_default: + if dtype and np.issubdtype(dtype, np.floating): + return result + na_value = self._dtype.na_value + mask = self.isna() + result[mask] = na_value return result def __len__(self) -> int: @@ -736,12 +740,32 @@ def value_counts(self, dropna: bool = True) -> Series: return Series(counts, index=index).astype("Int64") + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + + if is_dtype_equal(dtype, self.dtype): + if copy: + return self.copy() + return self + + elif isinstance(dtype, NumericDtype): + data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype)) + return dtype.__from_arrow__(data) + + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + return cls._from_sequence(self, dtype=dtype, copy=copy) + + return super().astype(dtype, copy) + # ------------------------------------------------------------------------ # String methods interface _str_na_value = ArrowStringDtype.na_value - def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): # TODO: 
de-duplicate with StringArray method. This method is moreless copy and # paste. @@ -834,6 +858,28 @@ def _str_endswith(self, pat: str, na=None): pat = re.escape(pat) + "$" return self._str_contains(pat, na=na, regex=True) + def _str_replace( + self, + pat: str | re.Pattern, + repl: str | Callable, + n: int = -1, + case: bool = True, + flags: int = 0, + regex: bool = True, + ): + if ( + pa_version_under4p0 + or isinstance(pat, re.Pattern) + or callable(repl) + or not case + or flags + ): + return super()._str_replace(pat, repl, n, case, flags, regex) + + func = pc.replace_substring_regex if regex else pc.replace_substring + result = func(self._data, pattern=pat, replacement=repl, max_replacements=n) + return type(self)(result) + def _str_match( self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None ): diff --git a/pandas/core/common.py b/pandas/core/common.py index 04ff2d2c4618f..c0e44a437f59e 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -142,11 +142,8 @@ def is_bool_indexer(key: Any) -> bool: elif is_bool_dtype(key.dtype): return True elif isinstance(key, list): - try: - arr = np.asarray(key) - return arr.dtype == np.bool_ and len(arr) == len(key) - except TypeError: # pragma: no cover - return False + arr = np.asarray(key) + return arr.dtype == np.bool_ and len(arr) == len(key) return False diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 51b9ed5fd22c7..edaa53cd55042 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -12,15 +12,12 @@ Sequence, cast, ) +import warnings import numpy as np import numpy.ma as ma from pandas._libs import lib -from pandas._libs.tslibs import ( - IncompatibleFrequency, - OutOfBoundsDatetime, -) from pandas._typing import ( AnyArrayLike, ArrayLike, @@ -39,6 +36,7 @@ maybe_cast_to_datetime, maybe_cast_to_integer_array, maybe_convert_platform, + maybe_infer_to_datetimelike, maybe_upcast, sanitize_to_nanoseconds, ) @@ -291,9 +289,9 @@ def array( IntegerArray, IntervalArray, PandasArray, + PeriodArray, StringArray, TimedeltaArray, - period_array, ) if lib.is_scalar(data): @@ -317,19 +315,10 @@ def array( if dtype is None: inferred_dtype = lib.infer_dtype(data, skipna=True) if inferred_dtype == "period": - try: - return period_array(data, copy=copy) - except IncompatibleFrequency: - # We may have a mixture of frequencies. - # We choose to return an ndarray, rather than raising. - pass + return PeriodArray._from_sequence(data, copy=copy) + elif inferred_dtype == "interval": - try: - return IntervalArray(data, copy=copy) - except ValueError: - # We may have a mixture of `closed` here. - # We choose to return an ndarray, rather than raising. - pass + return IntervalArray(data, copy=copy) elif inferred_dtype.startswith("datetime"): # datetime, datetime64 @@ -467,6 +456,8 @@ def sanitize_array( dtype: DtypeObj | None = None, copy: bool = False, raise_cast_failure: bool = True, + *, + allow_2d: bool = False, ) -> ArrayLike: """ Sanitize input data to an ndarray or ExtensionArray, copy if specified, @@ -479,6 +470,8 @@ def sanitize_array( dtype : np.dtype, ExtensionDtype, or None, default None copy : bool, default False raise_cast_failure : bool, default True + allow_2d : bool, default False + If False, raise if we have a 2D Arraylike. 
Returns ------- @@ -502,7 +495,7 @@ def sanitize_array( data = lib.item_from_zerodim(data) elif isinstance(data, range): # GH#16804 - data = np.arange(data.start, data.stop, data.step, dtype="int64") + data = range_to_ndarray(data) copy = False if not is_list_like(data): @@ -546,13 +539,13 @@ def sanitize_array( if dtype is not None or len(data) == 0: subarr = _try_cast(data, dtype, copy, raise_cast_failure) else: + # TODO: copy? subarr = maybe_convert_platform(data) - # error: Incompatible types in assignment (expression has type - # "Union[ExtensionArray, ndarray, List[Any]]", variable has type - # "ExtensionArray") - subarr = maybe_cast_to_datetime(subarr, dtype) # type: ignore[assignment] + if subarr.dtype == object: + subarr = cast(np.ndarray, subarr) + subarr = maybe_infer_to_datetimelike(subarr) - subarr = _sanitize_ndim(subarr, data, dtype, index) + subarr = _sanitize_ndim(subarr, data, dtype, index, allow_2d=allow_2d) if not ( isinstance(subarr.dtype, ExtensionDtype) or isinstance(dtype, ExtensionDtype) @@ -569,8 +562,32 @@ def sanitize_array( return subarr +def range_to_ndarray(rng: range) -> np.ndarray: + """ + Cast a range object to ndarray. + """ + # GH#30171 perf avoid realizing range as a list in np.array + try: + arr = np.arange(rng.start, rng.stop, rng.step, dtype="int64") + except OverflowError: + # GH#30173 handling for ranges that overflow int64 + if (rng.start >= 0 and rng.step > 0) or (rng.stop >= 0 and rng.step < 0): + try: + arr = np.arange(rng.start, rng.stop, rng.step, dtype="uint64") + except OverflowError: + arr = construct_1d_object_array_from_listlike(list(rng)) + else: + arr = construct_1d_object_array_from_listlike(list(rng)) + return arr + + def _sanitize_ndim( - result: ArrayLike, data, dtype: DtypeObj | None, index: Index | None + result: ArrayLike, + data, + dtype: DtypeObj | None, + index: Index | None, + *, + allow_2d: bool = False, ) -> ArrayLike: """ Ensure we have a 1-dimensional result array. @@ -584,13 +601,13 @@ def _sanitize_ndim( elif result.ndim > 1: if isinstance(data, np.ndarray): + if allow_2d: + return result raise ValueError("Data must be 1-dimensional") if is_object_dtype(dtype) and isinstance(dtype, ExtensionDtype): # i.e. PandasDtype("O") - # error: Argument "dtype" to "asarray_tuplesafe" has incompatible type - # "Type[object]"; expected "Union[str, dtype[Any], None]" - result = com.asarray_tuplesafe(data, dtype=object) # type: ignore[arg-type] + result = com.asarray_tuplesafe(data, dtype=np.dtype("object")) cls = dtype.construct_array_type() result = cls._from_sequence(result, dtype=dtype) else: @@ -658,31 +675,36 @@ def _try_cast( """ is_ndarray = isinstance(arr, np.ndarray) - # perf shortcut as this is the most common case - # Item "List[Any]" of "Union[List[Any], ndarray]" has no attribute "dtype" - if ( - is_ndarray - and arr.dtype != object # type: ignore[union-attr] - and not copy - and dtype is None - ): - # Argument 1 to "sanitize_to_nanoseconds" has incompatible type - # "Union[List[Any], ndarray]"; expected "ndarray" - return sanitize_to_nanoseconds(arr) # type: ignore[arg-type] + if dtype is None: + # perf shortcut as this is the most common case + if is_ndarray: + arr = cast(np.ndarray, arr) + if arr.dtype != object: + return sanitize_to_nanoseconds(arr, copy=copy) + + out = maybe_infer_to_datetimelike(arr) + if out is arr and copy: + out = out.copy() + return out - if isinstance(dtype, ExtensionDtype): + else: + # i.e. 
list + varr = np.array(arr, copy=False) + # filter out cases that we _dont_ want to go through + # maybe_infer_to_datetimelike + if varr.dtype != object or varr.size == 0: + return varr + return maybe_infer_to_datetimelike(varr) + + elif isinstance(dtype, ExtensionDtype): # create an extension array from its dtype - # DatetimeTZ case needs to go through maybe_cast_to_datetime but - # SparseDtype does not if isinstance(dtype, DatetimeTZDtype): # We can't go through _from_sequence because it handles dt64naive # data differently; _from_sequence treats naive as wall times, # while maybe_cast_to_datetime treats it as UTC # see test_maybe_promote_any_numpy_dtype_with_datetimetz - # error: Incompatible return value type (got "Union[ExtensionArray, - # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]") - return maybe_cast_to_datetime(arr, dtype) # type: ignore[return-value] + return maybe_cast_to_datetime(arr, dtype) # TODO: copy? array_type = dtype.construct_array_type()._from_sequence @@ -695,14 +717,8 @@ def _try_cast( return subarr return ensure_wrapped_if_datetimelike(arr).astype(dtype, copy=copy) - elif dtype is None and not is_ndarray: - # filter out cases that we _dont_ want to go through maybe_cast_to_datetime - varr = np.array(arr, copy=False) - if varr.dtype != object or varr.size == 0: - return varr - # error: Incompatible return value type (got "Union[ExtensionArray, - # ndarray, List[Any]]", expected "Union[ExtensionArray, ndarray]") - return maybe_cast_to_datetime(varr, None) # type: ignore[return-value] + elif dtype.kind in ["m", "M"]: + return maybe_cast_to_datetime(arr, dtype) try: # GH#15832: Check if we are requesting a numeric dtype and @@ -710,26 +726,32 @@ def _try_cast( if is_integer_dtype(dtype): # this will raise if we have e.g. floats - dtype = cast(np.dtype, dtype) maybe_cast_to_integer_array(arr, dtype) subarr = arr else: - subarr = maybe_cast_to_datetime(arr, dtype) - if dtype is not None and dtype.kind == "M": - return subarr + subarr = arr if not isinstance(subarr, ABCExtensionArray): + # 4 tests fail if we move this to a try/except/else; see + # test_constructor_compound_dtypes, test_constructor_cast_failure + # test_constructor_dict_cast2, test_loc_setitem_dtype subarr = construct_1d_ndarray_preserving_na(subarr, dtype, copy=copy) - except OutOfBoundsDatetime: - # in case of out of bound datetime64 -> always raise - raise - except (ValueError, TypeError) as err: - if dtype is not None and raise_cast_failure: - raise - elif "Cannot cast" in str(err): - # via _disallow_mismatched_datetimelike + + except (ValueError, TypeError): + if raise_cast_failure: raise else: + # we only get here with raise_cast_failure False, which means + # called via the DataFrame constructor + # GH#24435 + warnings.warn( + f"Could not cast to {dtype}, falling back to object. This " + "behavior is deprecated. 
In a future version, when a dtype is " + "passed to 'DataFrame', either all columns will be cast to that " + "dtype, or a TypeError will be raised", + FutureWarning, + stacklevel=7, + ) subarr = np.array(arr, dtype=object, copy=copy) return subarr diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index e3616bc857140..161572f3f1ac3 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -68,7 +68,6 @@ is_numeric_dtype, is_object_dtype, is_scalar, - is_sparse, is_string_dtype, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -124,9 +123,8 @@ def maybe_convert_platform( arr = values if arr.dtype == object: - # error: Argument 1 to "maybe_convert_objects" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "ndarray" - arr = lib.maybe_convert_objects(arr) # type: ignore[arg-type] + arr = cast(np.ndarray, arr) + arr = lib.maybe_convert_objects(arr) return arr @@ -782,22 +780,6 @@ def infer_dtype_from_scalar(val, pandas_dtype: bool = False) -> tuple[DtypeObj, return dtype, val -def dict_compat(d: dict[Scalar, Scalar]) -> dict[Scalar, Scalar]: - """ - Convert datetimelike-keyed dicts to a Timestamp-keyed dict. - - Parameters - ---------- - d: dict-like object - - Returns - ------- - dict - - """ - return {maybe_box_datetimelike(key): value for key, value in d.items()} - - def infer_dtype_from_array( arr, pandas_dtype: bool = False ) -> tuple[DtypeObj, ArrayLike]: @@ -1250,13 +1232,12 @@ def astype_array(values: ArrayLike, dtype: DtypeObj, copy: bool = False) -> Arra return values.copy() return values - if isinstance(values, ABCExtensionArray): + if not isinstance(values, np.ndarray): + # i.e. ExtensionArray values = values.astype(dtype, copy=copy) else: - # error: Argument 1 to "astype_nansafe" has incompatible type "ExtensionArray"; - # expected "ndarray" - values = astype_nansafe(values, dtype, copy=copy) # type: ignore[arg-type] + values = astype_nansafe(values, dtype, copy=copy) # in pandas we don't store numpy str dtypes, so convert to object if isinstance(dtype, np.dtype) and issubclass(values.dtype.type, str): @@ -1386,7 +1367,7 @@ def convert_dtypes( convert_integer: bool = True, convert_boolean: bool = True, convert_floating: bool = True, -) -> Dtype: +) -> DtypeObj: """ Convert objects to best possible type, and optionally, to types supporting ``pd.NA``. 
@@ -1407,23 +1388,28 @@ def convert_dtypes( Returns ------- - str, np.dtype, or ExtensionDtype - dtype - new dtype + np.dtype, or ExtensionDtype """ - inferred_dtype: str | np.dtype | ExtensionDtype - # TODO: rule out str + inferred_dtype: str | DtypeObj if ( convert_string or convert_integer or convert_boolean or convert_floating ) and isinstance(input_array, np.ndarray): - inferred_dtype = lib.infer_dtype(input_array) - if not convert_string and is_string_dtype(inferred_dtype): + if is_object_dtype(input_array.dtype): + inferred_dtype = lib.infer_dtype(input_array) + else: inferred_dtype = input_array.dtype + if is_string_dtype(inferred_dtype): + if not convert_string: + inferred_dtype = input_array.dtype + else: + inferred_dtype = pandas_dtype("string") + return inferred_dtype + if convert_integer: - target_int_dtype = "Int64" + target_int_dtype = pandas_dtype("Int64") if is_integer_dtype(input_array.dtype): from pandas.core.arrays.integer import INT_STR_TO_DTYPE @@ -1431,14 +1417,13 @@ def convert_dtypes( inferred_dtype = INT_STR_TO_DTYPE.get( input_array.dtype.name, target_int_dtype ) - if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( - input_array.dtype - ): - inferred_dtype = target_int_dtype - - else: - if is_integer_dtype(inferred_dtype): - inferred_dtype = input_array.dtype + elif is_numeric_dtype(input_array.dtype): + # TODO: de-dup with maybe_cast_to_integer_array? + arr = input_array[notna(input_array)] + if (arr.astype(int) == arr).all(): + inferred_dtype = target_int_dtype + else: + inferred_dtype = input_array.dtype if convert_floating: if not is_integer_dtype(input_array.dtype) and is_numeric_dtype( @@ -1446,32 +1431,33 @@ def convert_dtypes( ): from pandas.core.arrays.floating import FLOAT_STR_TO_DTYPE - inferred_float_dtype = FLOAT_STR_TO_DTYPE.get( - input_array.dtype.name, "Float64" + inferred_float_dtype: DtypeObj = FLOAT_STR_TO_DTYPE.get( + input_array.dtype.name, pandas_dtype("Float64") ) # if we could also convert to integer, check if all floats # are actually integers if convert_integer: + # TODO: de-dup with maybe_cast_to_integer_array? 
arr = input_array[notna(input_array)] if (arr.astype(int) == arr).all(): - inferred_dtype = "Int64" + inferred_dtype = pandas_dtype("Int64") else: inferred_dtype = inferred_float_dtype else: inferred_dtype = inferred_float_dtype - else: - if is_float_dtype(inferred_dtype): - inferred_dtype = input_array.dtype if convert_boolean: if is_bool_dtype(input_array.dtype): - inferred_dtype = "boolean" - else: - if isinstance(inferred_dtype, str) and inferred_dtype == "boolean": - inferred_dtype = input_array.dtype + inferred_dtype = pandas_dtype("boolean") + elif isinstance(inferred_dtype, str) and inferred_dtype == "boolean": + inferred_dtype = pandas_dtype("boolean") + + if isinstance(inferred_dtype, str): + # If we couldn't do anything else, then we retain the dtype + inferred_dtype = input_array.dtype else: - inferred_dtype = input_array.dtype + return input_array.dtype return inferred_dtype @@ -1541,7 +1527,7 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: else: return td_values.reshape(shape) - inferred_type = lib.infer_datetimelike_array(ensure_object(v)) + inferred_type, seen_str = lib.infer_datetimelike_array(ensure_object(v)) if inferred_type == "datetime": # error: Incompatible types in assignment (expression has type "ExtensionArray", @@ -1570,12 +1556,21 @@ def try_timedelta(v: np.ndarray) -> np.ndarray: # "ExtensionArray", variable has type "Union[ndarray, List[Any]]") value = try_datetime(v) # type: ignore[assignment] + if value.dtype.kind in ["m", "M"] and seen_str: + warnings.warn( + f"Inferring {value.dtype} from data containing strings is deprecated " + "and will be removed in a future version. To retain the old behavior " + "explicitly pass Series(data, dtype={value.dtype})", + FutureWarning, + stacklevel=find_stack_level(), + ) + # return v.reshape(shape) return value def maybe_cast_to_datetime( value: ExtensionArray | np.ndarray | list, dtype: DtypeObj | None -) -> ExtensionArray | np.ndarray | list: +) -> ExtensionArray | np.ndarray: """ try to cast the array/value to a datetimelike dtype, converting float nan to iNaT @@ -1583,83 +1578,100 @@ def maybe_cast_to_datetime( We allow a list *only* when dtype is not None. 
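The warning just above deprecates silently inferring a datetime64/timedelta64 dtype from data that contains strings. A hedged illustration of the recommended remediation (the exact inputs that trigger the warning depend on the inference path):

    import pandas as pd

    data = [pd.Timestamp("2021-01-01"), "2021-01-02"]
    # Instead of relying on inference, spell the dtype out as the warning suggests:
    ser = pd.Series(data, dtype="datetime64[ns]")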
""" from pandas.core.arrays.datetimes import sequence_to_datetimes - from pandas.core.arrays.timedeltas import sequence_to_td64ns + from pandas.core.arrays.timedeltas import TimedeltaArray if not is_list_like(value): raise TypeError("value must be listlike") + if is_timedelta64_dtype(dtype): + # TODO: _from_sequence would raise ValueError in cases where + # ensure_nanosecond_dtype raises TypeError + dtype = cast(np.dtype, dtype) + dtype = ensure_nanosecond_dtype(dtype) + res = TimedeltaArray._from_sequence(value, dtype=dtype) + return res + if dtype is not None: is_datetime64 = is_datetime64_dtype(dtype) is_datetime64tz = is_datetime64tz_dtype(dtype) - is_timedelta64 = is_timedelta64_dtype(dtype) vdtype = getattr(value, "dtype", None) - if is_datetime64 or is_datetime64tz or is_timedelta64: + if is_datetime64 or is_datetime64tz: dtype = ensure_nanosecond_dtype(dtype) - if not is_sparse(value): - value = np.array(value, copy=False) - - # we have an array of datetime or timedeltas & nulls - if value.size or not is_dtype_equal(value.dtype, dtype): - _disallow_mismatched_datetimelike(value, dtype) - - try: - if is_datetime64: - dta = sequence_to_datetimes(value, allow_object=False) - # GH 25843: Remove tz information since the dtype - # didn't specify one - - if dta.tz is not None: + value = np.array(value, copy=False) + + # we have an array of datetime or timedeltas & nulls + if value.size or not is_dtype_equal(value.dtype, dtype): + _disallow_mismatched_datetimelike(value, dtype) + + try: + if is_datetime64: + dta = sequence_to_datetimes(value, allow_object=False) + # GH 25843: Remove tz information since the dtype + # didn't specify one + + if dta.tz is not None: + warnings.warn( + "Data is timezone-aware. Converting " + "timezone-aware data to timezone-naive by " + "passing dtype='datetime64[ns]' to " + "DataFrame or Series is deprecated and will " + "raise in a future version. Use " + "`pd.Series(values).dt.tz_localize(None)` " + "instead.", + FutureWarning, + stacklevel=8, + ) + # equiv: dta.view(dtype) + # Note: NOT equivalent to dta.astype(dtype) + dta = dta.tz_localize(None) + + value = dta + elif is_datetime64tz: + dtype = cast(DatetimeTZDtype, dtype) + # The string check can be removed once issue #13712 + # is solved. String data that is passed with a + # datetime64tz is assumed to be naive which should + # be localized to the timezone. + is_dt_string = is_string_dtype(value.dtype) + dta = sequence_to_datetimes(value, allow_object=False) + if dta.tz is not None: + value = dta.astype(dtype, copy=False) + elif is_dt_string: + # Strings here are naive, so directly localize + # equiv: dta.astype(dtype) # though deprecated + + value = dta.tz_localize(dtype.tz) + else: + # Numeric values are UTC at this point, + # so localize and convert + # equiv: Series(dta).astype(dtype) # though deprecated + if getattr(vdtype, "kind", None) == "M": + # GH#24559, GH#33401 deprecate behavior inconsistent + # with DatetimeArray/DatetimeIndex warnings.warn( - "Data is timezone-aware. Converting " - "timezone-aware data to timezone-naive by " - "passing dtype='datetime64[ns]' to " - "DataFrame or Series is deprecated and will " - "raise in a future version. Use " - "`pd.Series(values).dt.tz_localize(None)` " - "instead.", + "In a future version, constructing a Series " + "from datetime64[ns] data and a " + "DatetimeTZDtype will interpret the data " + "as wall-times instead of " + "UTC times, matching the behavior of " + "DatetimeIndex. 
To treat the data as UTC " + "times, use pd.Series(data).dt" + ".tz_localize('UTC').tz_convert(dtype.tz) " + "or pd.Series(data.view('int64'), dtype=dtype)", FutureWarning, - stacklevel=8, + stacklevel=5, ) - # equiv: dta.view(dtype) - # Note: NOT equivalent to dta.astype(dtype) - dta = dta.tz_localize(None) - - value = dta - elif is_datetime64tz: - dtype = cast(DatetimeTZDtype, dtype) - # The string check can be removed once issue #13712 - # is solved. String data that is passed with a - # datetime64tz is assumed to be naive which should - # be localized to the timezone. - is_dt_string = is_string_dtype(value.dtype) - dta = sequence_to_datetimes(value, allow_object=False) - if dta.tz is not None: - value = dta.astype(dtype, copy=False) - elif is_dt_string: - # Strings here are naive, so directly localize - # equiv: dta.astype(dtype) # though deprecated - - value = dta.tz_localize(dtype.tz) - else: - # Numeric values are UTC at this point, - # so localize and convert - # equiv: Series(dta).astype(dtype) # though deprecated - - value = dta.tz_localize("UTC").tz_convert(dtype.tz) - elif is_timedelta64: - # if successful, we get a ndarray[td64ns] - value, _ = sequence_to_td64ns(value) - except OutOfBoundsDatetime: - raise - except ValueError: - # TODO(GH#40048): only catch dateutil's ParserError - # once we can reliably import it in all supported versions - if is_timedelta64: - raise - pass + + value = dta.tz_localize("UTC").tz_convert(dtype.tz) + except OutOfBoundsDatetime: + raise + except ValueError: + # TODO(GH#40048): only catch dateutil's ParserError + # once we can reliably import it in all supported versions + pass elif getattr(vdtype, "kind", None) in ["m", "M"]: # we are already datetimelike and want to coerce to non-datetimelike; @@ -1684,10 +1696,11 @@ def maybe_cast_to_datetime( "maybe_cast_to_datetime allows a list *only* if dtype is not None" ) - return value + # at this point we have converted or raised in all cases where we had a list + return cast(ArrayLike, value) -def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: +def sanitize_to_nanoseconds(values: np.ndarray, copy: bool = False) -> np.ndarray: """ Safely convert non-nanosecond datetime64 or timedelta64 values to nanosecond. """ @@ -1698,6 +1711,9 @@ def sanitize_to_nanoseconds(values: np.ndarray) -> np.ndarray: elif dtype.kind == "m" and dtype != TD64NS_DTYPE: values = conversion.ensure_timedelta64ns(values) + elif copy: + values = values.copy() + return values @@ -1950,7 +1966,7 @@ def construct_1d_object_array_from_listlike(values: Sized) -> np.ndarray: def construct_1d_ndarray_preserving_na( - values: Sequence, dtype: DtypeObj | None = None, copy: bool = False + values: Sequence, dtype: np.dtype | None = None, copy: bool = False ) -> np.ndarray: """ Construct a new ndarray, coercing `values` to `dtype`, preserving NA. 
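The deprecation message above spells out explicit alternatives for naive datetime64[ns] data combined with a DatetimeTZDtype. A small sketch; the timezone and values are illustrative, and the wall-time line illustrates the future default described by the warning rather than code taken from this diff:

    import numpy as np
    import pandas as pd

    data = np.array(["2021-06-01 12:00"], dtype="datetime64[ns]")
    dtype = pd.DatetimeTZDtype(tz="Europe/Berlin")

    # Keep today's reading (naive values treated as UTC instants):
    as_utc = pd.Series(data).dt.tz_localize("UTC").tz_convert(dtype.tz)

    # The reading that will become the default (naive values as wall times):
    as_wall = pd.Series(data).dt.tz_localize(dtype.tz)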
@@ -1995,24 +2011,16 @@ def construct_1d_ndarray_preserving_na( and isinstance(values, np.ndarray) and values.dtype.kind == "f" ): - # Argument 2 to "astype_float_to_int_nansafe" has incompatible - # type "Union[dtype[Any], ExtensionDtype]"; expected "dtype[Any]" - return astype_float_to_int_nansafe( - values, dtype, copy=copy # type: ignore[arg-type] - ) + return astype_float_to_int_nansafe(values, dtype, copy=copy) else: - # error: Argument "dtype" to "array" has incompatible type - # "Union[dtype[Any], ExtensionDtype, None]"; expected "Union[dtype[Any], - # None, type, _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, - # Union[int, Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" - subarr = np.array(values, dtype=dtype, copy=copy) # type: ignore[arg-type] + subarr = np.array(values, dtype=dtype, copy=copy) return subarr def maybe_cast_to_integer_array( arr: list | np.ndarray, dtype: np.dtype, copy: bool = False -): +) -> np.ndarray: """ Takes any dtype and returns the casted version, raising for when data is incompatible with integer/unsigned integer dtypes. @@ -2083,6 +2091,20 @@ def maybe_cast_to_integer_array( if is_float_dtype(arr.dtype) or is_object_dtype(arr.dtype): raise ValueError("Trying to coerce float values to integers") + if casted.dtype < arr.dtype: + # GH#41734 e.g. [1, 200, 923442] and dtype="int8" -> overflows + warnings.warn( + f"Values are too large to be losslessly cast to {dtype}. " + "In a future version this will raise OverflowError. To retain the " + f"old behavior, use pd.Series(values).astype({dtype})", + FutureWarning, + stacklevel=find_stack_level(), + ) + return casted + + # No known cases that get here, but raising explicitly to cover our bases. + raise ValueError(f"values cannot be losslessly cast to {dtype}") + def convert_scalar_for_putitemlike(scalar: Scalar, dtype: np.dtype) -> Scalar: """ diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 8c2cff21c114e..2cbf1a8063a92 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -37,7 +37,11 @@ is_string_dtype, needs_i8_conversion, ) -from pandas.core.dtypes.dtypes import ExtensionDtype +from pandas.core.dtypes.dtypes import ( + ExtensionDtype, + IntervalDtype, + PeriodDtype, +) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCExtensionArray, @@ -630,7 +634,13 @@ def is_valid_na_for_dtype(obj, dtype: DtypeObj) -> bool: # This is needed for Categorical, but is kind of weird return True - # must be PeriodDType + elif isinstance(dtype, PeriodDtype): + return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) + + elif isinstance(dtype, IntervalDtype): + return lib.is_float(obj) or obj is None or obj is libmissing.NA + + # fallback, default to allowing NaN, None, NA, NaT return not isinstance(obj, (np.datetime64, np.timedelta64, Decimal)) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 18ee1ad9bcd96..7545ea9a0733c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -94,7 +94,6 @@ infer_dtype_from_scalar, invalidate_string_dtypes, maybe_box_native, - maybe_convert_platform, maybe_downcast_to_dtype, validate_numeric_casting, ) @@ -260,6 +259,8 @@ _merge_doc = """ Merge DataFrame or named Series objects with a database-style join. +A named Series object is treated as a DataFrame with a single named column. + The join is done on columns or indexes. If joining columns on columns, the DataFrame indexes *will be ignored*. 
Otherwise if joining indexes on indexes or indexes on a column or columns, the index will be passed on. @@ -727,6 +728,15 @@ def __init__( if index is None or columns is None: raise ValueError("DataFrame constructor not properly called!") + # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; + # expected "Union[Union[Union[ExtensionArray, ndarray], + # Index, Series], Sequence[Any]]" + index = ensure_index(index) # type: ignore[arg-type] + # Argument 1 to "ensure_index" has incompatible type "Collection[Any]"; + # expected "Union[Union[Union[ExtensionArray, ndarray], + # Index, Series], Sequence[Any]]" + columns = ensure_index(columns) # type: ignore[arg-type] + if not dtype: dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True) @@ -2324,6 +2334,7 @@ def _from_arrays( dtype = pandas_dtype(dtype) manager = get_option("mode.data_manager") + columns = ensure_index(columns) mgr = arrays_to_mgr( arrays, columns, @@ -4498,35 +4509,11 @@ def _sanitize_column(self, value) -> ArrayLike: # We should never get here with DataFrame value if isinstance(value, Series): - value = _reindex_for_setitem(value, self.index) + return _reindex_for_setitem(value, self.index) - elif isinstance(value, ExtensionArray): - # Explicitly copy here - value = value.copy() + if is_list_like(value): com.require_length_match(value, self.index) - - elif is_sequence(value): - com.require_length_match(value, self.index) - - # turn me into an ndarray - if not isinstance(value, (np.ndarray, Index)): - if isinstance(value, list) and len(value) > 0: - value = maybe_convert_platform(value) - else: - value = com.asarray_tuplesafe(value) - elif isinstance(value, Index): - value = value.copy(deep=True)._values - else: - value = value.copy() - - # possibly infer to datetimelike - if is_object_dtype(value.dtype): - value = sanitize_array(value, None) - - else: - value = construct_1d_arraylike_from_scalar(value, len(self), dtype=None) - - return value + return sanitize_array(value, self.index, copy=True, allow_2d=True) @property def _series(self): @@ -4726,6 +4713,7 @@ def set_axis( ) -> DataFrame | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) @Appender( """ Examples @@ -4789,6 +4777,7 @@ def reindex(self, *args, **kwargs) -> DataFrame: kwargs.pop("labels", None) return super().reindex(**kwargs) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) def drop( self, labels=None, @@ -5353,6 +5342,7 @@ def shift( periods=periods, freq=freq, axis=axis, fill_value=fill_value ) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "keys"]) def set_index( self, keys, @@ -5619,6 +5609,7 @@ def reset_index( ) -> DataFrame | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) def reset_index( self, level: Hashable | Sequence[Hashable] | None = None, @@ -5856,6 +5847,7 @@ def notna(self) -> DataFrame: def notnull(self) -> DataFrame: return ~self.isna() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def dropna( self, axis: Axis = 0, @@ -6005,6 +5997,7 @@ def dropna( else: return result + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "subset"]) def drop_duplicates( self, subset: Hashable | Sequence[Hashable] | None = None, @@ -6240,6 +6233,7 @@ def f(vals) -> tuple[np.ndarray, int]: # ---------------------------------------------------------------------- # Sorting # TODO: Just move the sort_values doc here. 
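Several DataFrame methods above gain deprecate_nonkeyword_arguments, so arguments beyond those listed in allowed_args should be passed by keyword. A minimal usage sketch (the data is illustrative):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 1, 2], "b": [3.0, None, 5.0]})

    # Keyword form avoids the FutureWarning emitted for extra positional arguments:
    df = df.dropna(axis=0, how="any")
    df = df.drop_duplicates(subset=["a"], keep="last")
    df = df.set_index("a", drop=True)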
+ @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "by"]) @Substitution(**_shared_doc_kwargs) @Appender(NDFrame.sort_values.__doc__) # error: Signature of "sort_values" incompatible with supertype "NDFrame" @@ -6314,6 +6308,7 @@ def sort_values( # type: ignore[override] else: return result.__finalize__(self, method="sort_values") + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( self, axis: Axis = 0, @@ -8926,10 +8921,7 @@ def append( index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) - try: - combined_columns = self.columns.append(idx_diff) - except TypeError: - combined_columns = self.columns.astype(object).append(idx_diff) + combined_columns = self.columns.append(idx_diff) other = ( other.reindex(combined_columns, copy=False) .to_frame() @@ -9780,7 +9772,6 @@ def _reduce( **kwds, ): - min_count = kwds.get("min_count", 0) assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None @@ -9829,7 +9820,7 @@ def _get_data() -> DataFrame: data = self._get_bool_data() return data - if (numeric_only is not None or axis == 0) and min_count == 0: + if numeric_only is not None or axis == 0: # For numeric_only non-None and axis non-None, we know # which blocks to use and no try/except is needed. # For numeric_only=None only the case with axis==0 and no object @@ -10663,6 +10654,40 @@ def values(self) -> np.ndarray: self._consolidate_inplace() return self._mgr.as_array(transpose=True) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def ffill( + self: DataFrame, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> DataFrame | None: + return super().ffill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def bfill( + self: DataFrame, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> DataFrame | None: + return super().bfill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "lower", "upper"] + ) + def clip( + self: DataFrame, + lower=None, + upper=None, + axis: Axis | None = None, + inplace: bool = False, + *args, + **kwargs, + ) -> DataFrame | None: + return super().clip(lower, upper, axis, inplace, *args, **kwargs) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) def interpolate( self: DataFrame, @@ -10686,6 +10711,36 @@ def interpolate( **kwargs, ) + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().where(cond, other, inplace, axis, level, errors, try_cast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().mask(cond, other, inplace, axis, level, errors, try_cast) + DataFrame._add_numeric_operations() diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6d7c803685255..49dc71954fd8f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6392,47 +6392,6 @@ def fillna( else: return result.__finalize__(self, method="fillna") - @overload - def ffill( - self: 
FrameOrSeries, - axis: None | Axis = ..., - inplace: Literal[False] = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries: - ... - - @overload - def ffill( - self: FrameOrSeries, - axis: None | Axis, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def ffill( - self: FrameOrSeries, - *, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def ffill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: bool_t = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries | None: - ... - - @final @doc(klass=_shared_doc_kwargs["klass"]) def ffill( self: FrameOrSeries, @@ -6455,47 +6414,6 @@ def ffill( pad = ffill - @overload - def bfill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: Literal[False] = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries: - ... - - @overload - def bfill( - self: FrameOrSeries, - axis: None | Axis, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def bfill( - self: FrameOrSeries, - *, - inplace: Literal[True], - limit: None | int = ..., - downcast=..., - ) -> None: - ... - - @overload - def bfill( - self: FrameOrSeries, - axis: None | Axis = ..., - inplace: bool_t = ..., - limit: None | int = ..., - downcast=..., - ) -> FrameOrSeries | None: - ... - - @final @doc(klass=_shared_doc_kwargs["klass"]) def bfill( self: FrameOrSeries, @@ -7365,115 +7283,6 @@ def _clip_with_one_bound(self, threshold, method, axis, inplace): # GH 40420 return self.where(subset, threshold, axis=axis, inplace=inplace) - @overload - def clip( - self: FrameOrSeries, - lower=..., - upper=..., - axis: Axis | None = ..., - inplace: Literal[False] = ..., - *args, - **kwargs, - ) -> FrameOrSeries: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - *, - axis: Axis | None, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - *, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - upper, - axis: Axis | None, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - upper, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - axis: Axis | None, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - upper, - axis: Axis | None, - inplace: Literal[True], - *args, - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower, - upper, - *, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - *, - inplace: Literal[True], - **kwargs, - ) -> None: - ... - - @overload - def clip( - self: FrameOrSeries, - lower=..., - upper=..., - axis: Axis | None = ..., - inplace: bool_t = ..., - *args, - **kwargs, - ) -> FrameOrSeries | None: - ... 
- - @final def clip( self: FrameOrSeries, lower=None, @@ -9073,7 +8882,6 @@ def _where( result = self._constructor(new_data) return result.__finalize__(self) - @final @doc( klass=_shared_doc_kwargs["klass"], cond="True", @@ -9221,7 +9029,7 @@ def where( "try_cast keyword is deprecated and will be removed in a " "future version", FutureWarning, - stacklevel=2, + stacklevel=4, ) return self._where(cond, other, inplace, axis, level, errors=errors) @@ -9254,7 +9062,7 @@ def mask( "try_cast keyword is deprecated and will be removed in a " "future version", FutureWarning, - stacklevel=2, + stacklevel=4, ) # see gh-21891 @@ -9413,7 +9221,7 @@ def shift( else: new_ax = index.shift(periods, freq) - result = self.set_axis(new_ax, axis) + result = self.set_axis(new_ax, axis=axis) return result.__finalize__(self, method="shift") @final diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index c38c51d46f83e..69f992f840c7c 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -67,10 +67,7 @@ validate_func_kwargs, ) from pandas.core.apply import GroupByApply -from pandas.core.base import ( - DataError, - SpecificationError, -) +from pandas.core.base import SpecificationError import pandas.core.common as com from pandas.core.construction import create_series_with_explicit_dtype from pandas.core.frame import DataFrame @@ -323,7 +320,7 @@ def _aggregate_multiple_funcs(self, arg) -> DataFrame: return output def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): obj = self._selected_obj @@ -331,7 +328,10 @@ def _cython_agg_general( data = obj._mgr if numeric_only and not is_numeric_dtype(obj.dtype): - raise DataError("No numeric types to aggregate") + # GH#41291 match Series behavior + raise NotImplementedError( + f"{type(self).__name__}.{how} does not implement numeric_only." 
+ ) # This is overkill because it is only called once, but is here to # mirror the array_func used in DataFrameGroupBy._cython_agg_general @@ -513,16 +513,12 @@ def _cython_transform( obj = self._selected_obj - is_numeric = is_numeric_dtype(obj.dtype) - if numeric_only and not is_numeric: - raise DataError("No numeric types to aggregate") - try: result = self.grouper._cython_operation( "transform", obj._values, how, axis, **kwargs ) - except (NotImplementedError, TypeError): - raise DataError("No numeric types to aggregate") + except NotImplementedError as err: + raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err return obj._constructor(result, index=self.obj.index, name=obj.name) @@ -1056,12 +1052,11 @@ def _iterate_slices(self) -> Iterable[Series]: yield values def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ) -> DataFrame: # Note: we never get here with how="ohlc"; that goes through SeriesGroupBy data: Manager2D = self._get_data_to_aggregate() - orig = data if numeric_only: data = data.get_numeric_data(copy=False) @@ -1084,9 +1079,15 @@ def array_func(values: ArrayLike) -> ArrayLike: # continue and exclude the block new_mgr = data.grouped_reduce(array_func, ignore_failures=True) - if not len(new_mgr) and len(orig): - # If the original Manager was already empty, no need to raise - raise DataError("No numeric types to aggregate") + if len(new_mgr) < len(data): + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.{how} " + "is deprecated. In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which should be " + "valid for the function.", + FutureWarning, + stacklevel=4, + ) return self._wrap_agged_manager(new_mgr) @@ -1283,6 +1284,16 @@ def arr_func(bvalues: ArrayLike) -> ArrayLike: res_mgr = mgr.grouped_reduce(arr_func, ignore_failures=True) res_mgr.set_axis(1, mgr.axes[1]) + if len(res_mgr) < len(mgr): + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.{how} " + "is deprecated. In a future version, a TypeError will be raised. " + f"Before calling .{how}, select only columns which should be " + "valid for the transforming function.", + FutureWarning, + stacklevel=4, + ) + res_df = self.obj._constructor(res_mgr) if self.axis == 1: res_df = res_df.T @@ -1420,7 +1431,14 @@ def _transform_item_by_item(self, obj: DataFrame, wrapper) -> DataFrame: output[i] = sgb.transform(wrapper) except TypeError: # e.g. trying to call nanmean with string values - pass + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.transform " + "is deprecated. In a future version, a TypeError will be raised. " + "Before calling .transform, select only columns which should be " + "valid for the transforming function.", + FutureWarning, + stacklevel=5, + ) else: inds.append(i) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b27eb4bb8f325..6deb5bb1a76f0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -30,6 +30,7 @@ class providing the base-class of operations. 
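The groupby changes above replace DataError with NotImplementedError for SeriesGroupBy when numeric_only cannot be honoured, and add FutureWarnings when DataFrameGroupBy silently drops columns an operation cannot handle. A hedged sketch of how calling code can stay warning-free (whether a given call warns depends on which code path it takes):

    import pandas as pd

    df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 2.0, 3.0], "s": ["p", "q", "r"]})

    # Select only the columns that are valid for the operation up front:
    result = df.groupby("key")[["x"]].cumsum()

    # SeriesGroupBy now raises NotImplementedError rather than DataError here:
    # df.groupby("key")["s"].mean(numeric_only=True)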
Union, cast, ) +import warnings import numpy as np @@ -1100,6 +1101,34 @@ def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): def _wrap_applied_output(self, data, keys, values, not_indexed_same: bool = False): raise AbstractMethodError(self) + def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: + """ + Determine subclass-specific default value for 'numeric_only'. + + For SeriesGroupBy we want the default to be False (to match Series behavior). + For DataFrameGroupBy we want it to be True (for backwards-compat). + + Parameters + ---------- + numeric_only : bool or lib.no_default + + Returns + ------- + bool + """ + # GH#41291 + if numeric_only is lib.no_default: + # i.e. not explicitly passed by user + if self.obj.ndim == 2: + # i.e. DataFrameGroupBy + numeric_only = True + else: + numeric_only = False + + # error: Incompatible return value type (got "Union[bool, NoDefault]", + # expected "bool") + return numeric_only # type: ignore[return-value] + # ----------------------------------------------------------------- # numba @@ -1131,10 +1160,16 @@ def _transform_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) group_keys = self.grouper._get_group_keys() numba_transform_func = numba_.generate_numba_transform_func( - tuple(args), kwargs, func, engine_kwargs + kwargs, func, engine_kwargs ) result = numba_transform_func( - sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + sorted_data, + sorted_index, + starts, + ends, + len(group_keys), + len(data.columns), + *args, ) cache_key = (func, "groupby_transform") @@ -1157,11 +1192,15 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) starts, ends, sorted_index, sorted_data = self._numba_prep(func, data) group_keys = self.grouper._get_group_keys() - numba_agg_func = numba_.generate_numba_agg_func( - tuple(args), kwargs, func, engine_kwargs - ) + numba_agg_func = numba_.generate_numba_agg_func(kwargs, func, engine_kwargs) result = numba_agg_func( - sorted_data, sorted_index, starts, ends, len(group_keys), len(data.columns) + sorted_data, + sorted_index, + starts, + ends, + len(group_keys), + len(data.columns), + *args, ) cache_key = (func, "groupby_agg") @@ -1270,6 +1309,14 @@ def _python_agg_general(self, func, *args, **kwargs): # if this function is invalid for this dtype, we will ignore it. result = self.grouper.agg_series(obj, f) except TypeError: + warnings.warn( + f"Dropping invalid columns in {type(self).__name__}.agg " + "is deprecated. In a future version, a TypeError will be raised. 
" + "Before calling .agg, select only columns which should be " + "valid for the aggregating function.", + FutureWarning, + stacklevel=3, + ) continue key = base.OutputKey(label=name, position=idx) @@ -1289,22 +1336,15 @@ def _agg_general( alias: str, npfunc: Callable, ): + with group_selection_context(self): # try a cython aggregation if we can - result = None - try: - result = self._cython_agg_general( - how=alias, - alt=npfunc, - numeric_only=numeric_only, - min_count=min_count, - ) - except DataError: - pass - - # apply a non-cython aggregation - if result is None: - result = self.aggregate(lambda x: npfunc(x, axis=self.axis)) + result = self._cython_agg_general( + how=alias, + alt=npfunc, + numeric_only=numeric_only, + min_count=min_count, + ) return result.__finalize__(self.obj, method="groupby") def _agg_py_fallback( @@ -1348,7 +1388,7 @@ def _agg_py_fallback( return ensure_block_shape(res_values, ndim=ndim) def _cython_agg_general( - self, how: str, alt=None, numeric_only: bool = True, min_count: int = -1 + self, how: str, alt: Callable, numeric_only: bool, min_count: int = -1 ): raise AbstractMethodError(self) @@ -1568,7 +1608,7 @@ def count(self): @final @Substitution(name="groupby") @Substitution(see_also=_common_see_also) - def mean(self, numeric_only: bool = True): + def mean(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute mean of groups, excluding missing values. @@ -1616,6 +1656,8 @@ def mean(self, numeric_only: bool = True): 2 4.0 Name: B, dtype: float64 """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "mean", alt=lambda x: Series(x).mean(numeric_only=numeric_only), @@ -1626,7 +1668,7 @@ def mean(self, numeric_only: bool = True): @final @Substitution(name="groupby") @Appender(_common_see_also) - def median(self, numeric_only=True): + def median(self, numeric_only: bool | lib.NoDefault = lib.no_default): """ Compute median of groups, excluding missing values. @@ -1643,6 +1685,8 @@ def median(self, numeric_only=True): Series or DataFrame Median of values within each group. """ + numeric_only = self._resolve_numeric_only(numeric_only) + result = self._cython_agg_general( "median", alt=lambda x: Series(x).median(numeric_only=numeric_only), @@ -1700,8 +1744,9 @@ def var(self, ddof: int = 1): Variance of values within each group. 
""" if ddof == 1: + numeric_only = self._resolve_numeric_only(lib.no_default) return self._cython_agg_general( - "var", alt=lambda x: Series(x).var(ddof=ddof) + "var", alt=lambda x: Series(x).var(ddof=ddof), numeric_only=numeric_only ) else: func = lambda x: x.var(ddof=ddof) @@ -1766,7 +1811,10 @@ def size(self) -> FrameOrSeriesUnion: @final @doc(_groupby_agg_method_template, fname="sum", no=True, mc=0) - def sum(self, numeric_only: bool = True, min_count: int = 0): + def sum( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) # If we are grouping on categoricals we want unobserved categories to # return zero, rather than the default of NaN which the reindexing in @@ -1783,7 +1831,11 @@ def sum(self, numeric_only: bool = True, min_count: int = 0): @final @doc(_groupby_agg_method_template, fname="prod", no=True, mc=0) - def prod(self, numeric_only: bool = True, min_count: int = 0): + def prod( + self, numeric_only: bool | lib.NoDefault = lib.no_default, min_count: int = 0 + ): + numeric_only = self._resolve_numeric_only(numeric_only) + return self._agg_general( numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod ) @@ -2712,7 +2764,7 @@ def _get_cythonized_result( how: str, cython_dtype: np.dtype, aggregate: bool = False, - numeric_only: bool = True, + numeric_only: bool | lib.NoDefault = lib.no_default, needs_counts: bool = False, needs_values: bool = False, needs_2d: bool = False, @@ -2780,6 +2832,8 @@ def _get_cythonized_result( ------- `Series` or `DataFrame` with filled values """ + numeric_only = self._resolve_numeric_only(numeric_only) + if result_is_index and aggregate: raise ValueError("'result_is_index' and 'aggregate' cannot both be True!") if post_processing and not callable(post_processing): @@ -2829,6 +2883,16 @@ def _get_cythonized_result( vals, inferences = pre_processing(vals) except TypeError as err: error_msg = str(err) + howstr = how.replace("group_", "") + warnings.warn( + "Dropping invalid columns in " + f"{type(self).__name__}.{howstr} is deprecated. " + "In a future version, a TypeError will be raised. " + f"Before calling .{howstr}, select only columns which " + "should be valid for the function.", + FutureWarning, + stacklevel=3, + ) continue vals = vals.astype(cython_dtype, copy=False) if needs_2d: diff --git a/pandas/core/groupby/numba_.py b/pandas/core/groupby/numba_.py index 26070fcb5e89c..ad78280c5d835 100644 --- a/pandas/core/groupby/numba_.py +++ b/pandas/core/groupby/numba_.py @@ -56,11 +56,12 @@ def f(values, index, ...): def generate_numba_agg_func( - args: tuple, kwargs: dict[str, Any], func: Callable[..., Scalar], engine_kwargs: dict[str, bool] | None, -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: +) -> Callable[ + [np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int, Any], np.ndarray +]: """ Generate a numba jitted agg function specified by values from engine_kwargs. 
@@ -72,8 +73,6 @@ def generate_numba_agg_func( Parameters ---------- - args : tuple - *args to be passed into the function kwargs : dict **kwargs to be passed into the function func : function @@ -103,6 +102,7 @@ def group_agg( end: np.ndarray, num_groups: int, num_columns: int, + *args: Any, ) -> np.ndarray: result = np.empty((num_groups, num_columns)) for i in numba.prange(num_groups): @@ -116,11 +116,12 @@ def group_agg( def generate_numba_transform_func( - args: tuple, kwargs: dict[str, Any], func: Callable[..., np.ndarray], engine_kwargs: dict[str, bool] | None, -) -> Callable[[np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int], np.ndarray]: +) -> Callable[ + [np.ndarray, np.ndarray, np.ndarray, np.ndarray, int, int, Any], np.ndarray +]: """ Generate a numba jitted transform function specified by values from engine_kwargs. @@ -132,8 +133,6 @@ def generate_numba_transform_func( Parameters ---------- - args : tuple - *args to be passed into the function kwargs : dict **kwargs to be passed into the function func : function @@ -163,6 +162,7 @@ def group_transform( end: np.ndarray, num_groups: int, num_columns: int, + *args: Any, ) -> np.ndarray: result = np.empty((len(values), num_columns)) for i in numba.prange(num_groups): diff --git a/pandas/core/indexers.py b/pandas/core/indexers.py index 4f3f536cd3290..ed4b1a3fbb39c 100644 --- a/pandas/core/indexers.py +++ b/pandas/core/indexers.py @@ -166,6 +166,8 @@ def check_setitem_lengths(indexer, value, values) -> bool: if is_list_like(value): if len(indexer) != len(value) and values.ndim == 1: # boolean with truth values == len of the value is ok too + if isinstance(indexer, list): + indexer = np.array(indexer) if not ( isinstance(indexer, np.ndarray) and indexer.dtype == np.bool_ diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index b9fd18dfdce73..124903446220d 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -54,6 +54,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + deprecate_nonkeyword_arguments, doc, ) @@ -76,7 +77,6 @@ is_float_dtype, is_hashable, is_integer, - is_integer_dtype, is_interval_dtype, is_iterator, is_list_like, @@ -775,6 +775,7 @@ def _engine(self) -> libindex.IndexEngine: target_values = self._get_engine_target() return self._engine_type(lambda: target_values, len(self)) + @final @cache_readonly def _dir_additions_for_owner(self) -> set[str_t]: """ @@ -813,6 +814,7 @@ def __array_wrap__(self, result, context=None): return result attrs = self._get_attributes_dict() + attrs.pop("freq", None) # For DatetimeIndex/TimedeltaIndex return Index(result, **attrs) @cache_readonly @@ -904,13 +906,10 @@ def astype(self, dtype, copy=True): if is_dtype_equal(self.dtype, dtype): return self.copy() if copy else self - elif is_categorical_dtype(dtype): - from pandas.core.indexes.category import CategoricalIndex - - return CategoricalIndex(self, name=self.name, dtype=dtype, copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) + elif isinstance(dtype, ExtensionDtype): + cls = dtype.construct_array_type() + new_values = cls._from_sequence(self, dtype=dtype, copy=False) + return Index(new_values, dtype=dtype, copy=copy, name=self.name) try: casted = self._values.astype(dtype, copy=copy) @@ -929,19 +928,20 @@ def astype(self, dtype, copy=True): Parameters ---------- - indices : list + indices : array-like Indices to be taken. axis : int, optional The axis over which to select values, always 0. 
allow_fill : bool, default True - fill_value : bool, default None + fill_value : scalar, default None If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError. + -1 are regarded as NA. If Index doesn't hold NA, raise ValueError. Returns ------- - numpy.ndarray - Elements of given indices. + Index + An index formed of elements at the given indices. Will be the same + type as self, except for RangeIndex. See Also -------- @@ -950,7 +950,9 @@ def astype(self, dtype, copy=True): """ @Appender(_index_shared_docs["take"] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, fill_value=None, **kwargs): + def take( + self, indices, axis: int = 0, allow_fill: bool = True, fill_value=None, **kwargs + ): if kwargs: nv.validate_take((), kwargs) indices = ensure_platform_int(indices) @@ -1155,18 +1157,25 @@ def _format_data(self, name=None) -> str_t: is_justify = False return format_object_summary( - self, self._formatter_func, is_justify=is_justify, name=name + self, + self._formatter_func, + is_justify=is_justify, + name=name, + line_break_each_value=self._is_multi, ) - def _format_attrs(self): + def _format_attrs(self) -> list[tuple[str_t, str_t | int]]: """ Return a list of tuples of the (attr,formatted_value). """ - return format_object_attrs(self) + return format_object_attrs(self, include_dtype=not self._is_multi) - def _mpl_repr(self): + @final + def _mpl_repr(self) -> np.ndarray: # how to represent ourselves to matplotlib - return self.values + if isinstance(self.dtype, np.dtype) and self.dtype.kind != "M": + return cast(np.ndarray, self.values) + return self.astype(object, copy=False)._values def format( self, @@ -1526,7 +1535,7 @@ def _set_names(self, values, level=None) -> None: names = property(fset=_set_names, fget=_get_names) - @final + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "names"]) def set_names(self, names, level=None, inplace: bool = False): """ Set Index or MultiIndex name. @@ -2404,6 +2413,13 @@ def is_all_dates(self) -> bool: ) return self._is_all_dates + @cache_readonly + def _is_multi(self) -> bool: + """ + Cached check equivalent to isinstance(self, MultiIndex) + """ + return isinstance(self, ABCMultiIndex) + # -------------------------------------------------------------------- # Pickle Methods @@ -2633,7 +2649,7 @@ def unique(self: _IndexT, level: Hashable | None = None) -> _IndexT: result = super().unique() return self._shallow_copy(result) - @final + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def drop_duplicates(self: _IndexT, keep: str_t | bool = "first") -> _IndexT: """ Return Index with duplicate values removed. @@ -2946,20 +2962,7 @@ def union(self, other, sort=None): stacklevel=2, ) - dtype = find_common_type([self.dtype, other.dtype]) - if self._is_numeric_dtype and other._is_numeric_dtype: - # Right now, we treat union(int, float) a bit special. - # See https://github.com/pandas-dev/pandas/issues/26778 for discussion - # We may change union(int, float) to go to object. 
- # float | [u]int -> float (the special case) - # | -> T - # | -> object - if not (is_integer_dtype(self.dtype) and is_integer_dtype(other.dtype)): - dtype = np.dtype("float64") - else: - # one is int64 other is uint64 - dtype = np.dtype("object") - + dtype = self._find_common_type_compat(other) left = self.astype(dtype, copy=False) right = other.astype(dtype, copy=False) return left.union(right, sort=sort) @@ -5393,6 +5396,19 @@ def _find_common_type_compat(self, target) -> DtypeObj: return IntervalDtype(np.float64, closed=self.closed) target_dtype, _ = infer_dtype_from(target, pandas_dtype=True) + + # special case: if one dtype is uint64 and the other a signed int, return object + # See https://github.com/pandas-dev/pandas/issues/26778 for discussion + # Now it's: + # * float | [u]int -> float + # * uint64 | signed int -> object + # We may change union(float | [u]int) to go to object. + if self.dtype == "uint64" or target_dtype == "uint64": + if is_signed_integer_dtype(self.dtype) or is_signed_integer_dtype( + target_dtype + ): + return np.dtype("object") + dtype = find_common_type([self.dtype, target_dtype]) if dtype.kind in ["i", "u"]: # TODO: what about reversed with self being categorical? @@ -6193,6 +6209,7 @@ def shape(self) -> Shape: # See GH#27775, GH#27384 for history/reasoning in how this is defined. return (len(self),) + @final def _deprecated_arg(self, value, name: str_t, methodname: str_t) -> None: """ Issue a FutureWarning if the arg/kwarg is not no_default. @@ -6281,27 +6298,18 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind if copy: index_like = index_like.copy() return index_like - if hasattr(index_like, "name"): - # https://github.com/python/mypy/issues/1424 - # error: Item "ExtensionArray" of "Union[ExtensionArray, - # Sequence[Any]]" has no attribute "name" - # error: Item "Sequence[Any]" of "Union[ExtensionArray, Sequence[Any]]" - # has no attribute "name" - # error: "Sequence[Any]" has no attribute "name" - # error: Item "Sequence[Any]" of "Union[Series, Sequence[Any]]" has no - # attribute "name" - # error: Item "Sequence[Any]" of "Union[Any, Sequence[Any]]" has no - # attribute "name" - name = index_like.name # type: ignore[union-attr, attr-defined] + + if isinstance(index_like, ABCSeries): + name = index_like.name return Index(index_like, name=name, copy=copy) if is_iterator(index_like): index_like = list(index_like) - # must check for exactly list here because of strict type - # check in clean_index_list if isinstance(index_like, list): - if type(index_like) != list: + if type(index_like) is not list: + # must check for exactly list here because of strict type + # check in clean_index_list index_like = list(index_like) converted, all_arrays = lib.clean_index_list(index_like) @@ -6311,13 +6319,6 @@ def ensure_index(index_like: AnyArrayLike | Sequence, copy: bool = False) -> Ind return MultiIndex.from_arrays(converted) else: - if isinstance(converted, np.ndarray) and converted.dtype == np.int64: - # Check for overflows if we should actually be uint64 - # xref GH#35481 - alt = np.asarray(index_like) - if alt.dtype == np.uint64: - converted = alt - index_like = converted else: # clean_index_list does the equivalent of copying @@ -6425,12 +6426,8 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: return data elif inferred == "interval": - try: - ia_data = IntervalArray._from_sequence(subarr, copy=False) - return ia_data - except (ValueError, TypeError): - # GH27172: mixed closed Intervals 
--> object dtype - pass + ia_data = IntervalArray._from_sequence(subarr, copy=False) + return ia_data elif inferred == "boolean": # don't support boolean explicitly ATM pass @@ -6449,11 +6446,8 @@ def _maybe_cast_data_without_dtype(subarr: np.ndarray) -> ArrayLike: tda = TimedeltaArray._from_sequence(subarr, copy=False) return tda elif inferred == "period": - try: - parr = PeriodArray._from_sequence(subarr) - return parr - except IncompatibleFrequency: - pass + parr = PeriodArray._from_sequence(subarr) + return parr return subarr diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index e835990eb8d89..1541885887dab 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -222,6 +222,17 @@ def __new__( name = maybe_extract_name(name, data, cls) + if data is None: + # GH#38944 + warnings.warn( + "Constructing a CategoricalIndex without passing data is " + "deprecated and will raise in a future version. " + "Use CategoricalIndex([], ...) instead", + FutureWarning, + stacklevel=2, + ) + data = [] + if is_scalar(data): raise cls._scalar_data_error(data) @@ -324,13 +335,8 @@ def _format_attrs(self): # error: "CategoricalIndex" has no attribute "ordered" ("ordered", self.ordered), # type: ignore[attr-defined] ] - if self.name is not None: - attrs.append(("name", ibase.default_pprint(self.name))) - attrs.append(("dtype", f"'{self.dtype.name}'")) - max_seq_items = get_option("display.max_seq_items") or len(self) - if len(self) > max_seq_items: - attrs.append(("length", len(self))) - return attrs + extra = super()._format_attrs() + return attrs + extra def _format_with_header(self, header: list[str], na_rep: str = "NaN") -> list[str]: from pandas.io.formats.printing import pprint_thing diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index b2377f5b27966..5f24eb0cfaad6 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -35,7 +35,6 @@ ) from pandas.core.dtypes.common import ( - is_bool_dtype, is_categorical_dtype, is_dtype_equal, is_integer, @@ -83,6 +82,7 @@ class DatetimeIndexOpsMixin(NDArrayBackedExtensionIndex): Common ops mixin to support a unified interface datetimelike Index. """ + _is_numeric_dtype = False _can_hold_strings = False _data: DatetimeArray | TimedeltaArray | PeriodArray freq: BaseOffset | None @@ -113,15 +113,10 @@ def __array_wrap__(self, result, context=None): """ Gets called after a ufunc and other functions. 
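The CategoricalIndex hunk above deprecates constructing the index with no data at all; the replacement it suggests is an explicit (possibly empty) list:

    import pandas as pd

    # Deprecated, emits a FutureWarning under the check added above:
    # idx = pd.CategoricalIndex()

    # Preferred spelling for an empty index:
    idx = pd.CategoricalIndex([], categories=["a", "b"], ordered=False)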
""" - result = lib.item_from_zerodim(result) - if is_bool_dtype(result) or lib.is_scalar(result): - return result - - attrs = self._get_attributes_dict() - if not is_period_dtype(self.dtype) and attrs["freq"]: - # no need to infer if freq is None - attrs["freq"] = "infer" - return type(self)(result, **attrs) + out = super().__array_wrap__(result, context=context) + if isinstance(out, DatetimeTimedeltaMixin) and self.freq is not None: + out = out._with_freq("infer") + return out # ------------------------------------------------------------------------ @@ -361,7 +356,9 @@ def _format_attrs(self): freq = self.freqstr if freq is not None: freq = repr(freq) - attrs.append(("freq", freq)) + # Argument 1 to "append" of "list" has incompatible type + # "Tuple[str, Optional[str]]"; expected "Tuple[str, Union[str, int]]" + attrs.append(("freq", freq)) # type: ignore[arg-type] return attrs def _summary(self, name=None) -> str: @@ -612,6 +609,8 @@ class DatetimeTimedeltaMixin(DatetimeIndexOpsMixin): """ _data: DatetimeArray | TimedeltaArray + _comparables = ["name", "freq"] + _attributes = ["name", "freq"] # Compat for frequency inference, see GH#23789 _is_monotonic_increasing = Index.is_monotonic_increasing diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index ac09159c23566..c4329393bb895 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -25,7 +25,6 @@ ) from pandas._libs.tslibs import ( Resolution, - ints_to_pydatetime, parsing, timezones, to_offset, @@ -257,11 +256,6 @@ class DatetimeIndex(DatetimeTimedeltaMixin): _engine_type = libindex.DatetimeEngine _supports_partial_string_indexing = True - _comparables = ["name", "freqstr", "tz"] - _attributes = ["name", "tz", "freq"] - - _is_numeric_dtype = False - _data: DatetimeArray inferred_freq: str | None tz: tzinfo | None @@ -392,10 +386,6 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: # -------------------------------------------------------------------- # Rendering Methods - def _mpl_repr(self) -> np.ndarray: - # how to represent ourselves to matplotlib - return ints_to_pydatetime(self.asi8, self.tz) - @property def _formatter_func(self): from pandas.io.formats.format import get_format_datetime64 diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index 83998a2792a8a..b1cabf92bf985 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -13,16 +13,12 @@ from pandas._typing import ArrayLike from pandas.compat.numpy import function as nv -from pandas.errors import AbstractMethodError from pandas.util._decorators import ( cache_readonly, doc, ) +from pandas.util._exceptions import rewrite_exception -from pandas.core.dtypes.cast import ( - find_common_type, - infer_dtype_from, -) from pandas.core.dtypes.common import ( is_dtype_equal, is_object_dtype, @@ -33,6 +29,7 @@ ABCSeries, ) +from pandas.core.array_algos.putmask import validate_putmask from pandas.core.arrays import ( Categorical, DatetimeArray, @@ -296,6 +293,21 @@ def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: # overriding IndexOpsMixin improves performance GH#38083 return self._data.searchsorted(value, side=side, sorter=sorter) + def putmask(self, mask, value) -> Index: + mask, noop = validate_putmask(self._data, mask) + if noop: + return self.copy() + + try: + self._validate_fill_value(value) + except (ValueError, TypeError): + dtype = self._find_common_type_compat(value) + return self.astype(dtype).putmask(mask, value) + + 
arr = self._data.copy() + arr.putmask(mask, value) + return type(self)._simple_new(arr, name=self.name) + # --------------------------------------------------------------------- def _get_engine_target(self) -> np.ndarray: @@ -322,9 +334,30 @@ def repeat(self, repeats, axis=None): result = self._data.repeat(repeats, axis=axis) return type(self)._simple_new(result, name=self.name) - def insert(self, loc: int, item): - # ExtensionIndex subclasses must override Index.insert - raise AbstractMethodError(self) + def insert(self, loc: int, item) -> Index: + """ + Make new Index inserting new item at location. Follows + Python list.append semantics for negative values. + + Parameters + ---------- + loc : int + item : object + + Returns + ------- + new_index : Index + """ + try: + result = self._data.insert(loc, item) + except (ValueError, TypeError): + # e.g. trying to insert an integer into a DatetimeIndex + # We cannot keep the same dtype, so cast to the (often object) + # minimal shared dtype before doing the insert. + dtype = self._find_common_type_compat(item) + return self.astype(dtype).insert(loc, item) + else: + return type(self)._simple_new(result, name=self.name) def _validate_fill_value(self, value): """ @@ -365,11 +398,17 @@ def astype(self, dtype, copy: bool = True) -> Index: return self return self.copy() - if isinstance(dtype, np.dtype) and dtype.kind == "M" and dtype != "M8[ns]": + if ( + isinstance(self.dtype, np.dtype) + and isinstance(dtype, np.dtype) + and dtype.kind == "M" + and dtype != "M8[ns]" + ): # For now Datetime supports this by unwrapping ndarray, but DTI doesn't - raise TypeError(f"Cannot cast {type(self._data).__name__} to dtype") + raise TypeError(f"Cannot cast {type(self).__name__} to dtype") - new_values = self._data.astype(dtype, copy=copy) + with rewrite_exception(type(self._data).__name__, type(self).__name__): + new_values = self._data.astype(dtype, copy=copy) # pass copy=False because any copying will be done in the # _data.astype call above @@ -419,60 +458,3 @@ def _get_engine_target(self) -> np.ndarray: def _from_join_target(self, result: np.ndarray) -> ArrayLike: assert result.dtype == self._data._ndarray.dtype return self._data._from_backing_data(result) - - def insert(self: _T, loc: int, item) -> Index: - """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values. - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - new_index : Index - - Raises - ------ - ValueError if the item is not valid for this dtype. - """ - arr = self._data - try: - code = arr._validate_scalar(item) - except (ValueError, TypeError): - # e.g. trying to insert an integer into a DatetimeIndex - # We cannot keep the same dtype, so cast to the (often object) - # minimal shared dtype before doing the insert. 
- dtype, _ = infer_dtype_from(item, pandas_dtype=True) - dtype = find_common_type([self.dtype, dtype]) - return self.astype(dtype).insert(loc, item) - else: - new_vals = np.concatenate( - ( - arr._ndarray[:loc], - np.asarray([code], dtype=arr._ndarray.dtype), - arr._ndarray[loc:], - ) - ) - new_arr = arr._from_backing_data(new_vals) - return type(self)._simple_new(new_arr, name=self.name) - - def putmask(self, mask, value) -> Index: - res_values = self._data.copy() - try: - res_values.putmask(mask, value) - except (TypeError, ValueError): - return self.astype(object).putmask(mask, value) - - return type(self)._simple_new(res_values, name=self.name) - - # error: Argument 1 of "_wrap_joined_index" is incompatible with supertype - # "Index"; supertype defines the argument type as "Union[ExtensionArray, ndarray]" - def _wrap_joined_index( # type: ignore[override] - self: _T, joined: NDArrayBackedExtensionArray, other: _T - ) -> _T: - name = get_op_result_name(self, other) - - return type(self)._simple_new(joined, name=name) diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index fc92a1b3afe53..06ab7fdbcf872 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -16,8 +16,6 @@ import numpy as np -from pandas._config import get_option - from pandas._libs import lib from pandas._libs.interval import ( Interval, @@ -69,7 +67,6 @@ take_nd, unique, ) -from pandas.core.array_algos.putmask import validate_putmask from pandas.core.arrays.interval import ( IntervalArray, _interval_shared_docs, @@ -80,7 +77,6 @@ from pandas.core.indexes.base import ( Index, _index_shared_docs, - default_pprint, ensure_index, maybe_extract_name, ) @@ -255,8 +251,6 @@ def func(self, other, sort=None): @inherit_names(["is_non_overlapping_monotonic", "closed"], IntervalArray, cache=True) class IntervalIndex(ExtensionIndex): _typ = "intervalindex" - _comparables = ["name"] - _attributes = ["name", "closed"] # annotate properties pinned via inherit_names closed: str @@ -422,21 +416,11 @@ def __contains__(self, key: Any) -> bool: def _multiindex(self) -> MultiIndex: return MultiIndex.from_arrays([self.left, self.right], names=["left", "right"]) - def __array_wrap__(self, result, context=None): - # we don't want the superclass implementation - return result - def __reduce__(self): - d = {"left": self.left, "right": self.right} + d = {"left": self.left, "right": self.right, "closed": self.closed} d.update(self._get_attributes_dict()) return _new_IntervalIndex, (type(self), d), None - @Appender(Index.astype.__doc__) - def astype(self, dtype, copy: bool = True): - with rewrite_exception("IntervalArray", type(self).__name__): - new_values = self._values.astype(dtype, copy=copy) - return Index(new_values, dtype=new_values.dtype, name=self.name) - @property def inferred_type(self) -> str: """Return a string of the type inferred from the values""" @@ -789,9 +773,11 @@ def _get_indexer_pointwise(self, target: Index) -> tuple[np.ndarray, np.ndarray] except KeyError: missing.append(i) locs = np.array([-1]) - except InvalidIndexError as err: - # i.e. non-scalar key - raise TypeError(key) from err + except InvalidIndexError: + # i.e. non-scalar key e.g. a tuple. 
+ # see test_append_different_columns_types_raises + missing.append(i) + locs = np.array([-1]) indexer.append(locs) @@ -867,46 +853,6 @@ def mid(self) -> Index: def length(self) -> Index: return Index(self._data.length, copy=False) - def putmask(self, mask, value) -> Index: - mask, noop = validate_putmask(self._data, mask) - if noop: - return self.copy() - - try: - self._validate_fill_value(value) - except (ValueError, TypeError): - dtype = self._find_common_type_compat(value) - return self.astype(dtype).putmask(mask, value) - - arr = self._data.copy() - arr.putmask(mask, value) - return type(self)._simple_new(arr, name=self.name) - - def insert(self, loc: int, item): - """ - Return a new IntervalIndex inserting new item at location. Follows - Python list.append semantics for negative values. Only Interval - objects and NA can be inserted into an IntervalIndex - - Parameters - ---------- - loc : int - item : object - - Returns - ------- - IntervalIndex - """ - try: - result = self._data.insert(loc, item) - except (ValueError, TypeError): - # e.g trying to insert a string - dtype, _ = infer_dtype_from_scalar(item, pandas_dtype=True) - dtype = find_common_type([self.dtype, dtype]) - return self.astype(dtype).insert(loc, item) - - return type(self)._simple_new(result, name=self.name) - # -------------------------------------------------------------------- # Rendering Methods # __repr__ associated methods are based on MultiIndex @@ -919,49 +865,9 @@ def _format_native_types(self, na_rep="NaN", quoting=None, **kwargs): return super()._format_native_types(na_rep=na_rep, quoting=quoting, **kwargs) def _format_data(self, name=None) -> str: - # TODO: integrate with categorical and make generic # name argument is unused here; just for compat with base / categorical - n = len(self) - max_seq_items = min((get_option("display.max_seq_items") or n) // 10, 10) - - formatter = str - - if n == 0: - summary = "[]" - elif n == 1: - first = formatter(self[0]) - summary = f"[{first}]" - elif n == 2: - first = formatter(self[0]) - last = formatter(self[-1]) - summary = f"[{first}, {last}]" - else: - - if n > max_seq_items: - n = min(max_seq_items // 2, 10) - head = [formatter(x) for x in self[:n]] - tail = [formatter(x) for x in self[-n:]] - head_joined = ", ".join(head) - tail_joined = ", ".join(tail) - summary = f"[{head_joined} ... 
{tail_joined}]" - else: - tail = [formatter(x) for x in self] - joined = ", ".join(tail) - summary = f"[{joined}]" - - return summary + "," + self._format_space() - - def _format_attrs(self): - attrs = [] - if self.name is not None: - attrs.append(("name", default_pprint(self.name))) - attrs.append(("dtype", f"'{self.dtype}'")) - return attrs - - def _format_space(self) -> str: - space = " " * (len(type(self).__name__) + 1) - return f"\n{space}" + return self._data._format_data() + "," + self._format_space() # -------------------------------------------------------------------- # Set Operations @@ -1214,6 +1120,8 @@ def interval_range( if periods is not None: periods += 1 + breaks: np.ndarray | TimedeltaIndex | DatetimeIndex + if is_number(endpoint): # force consistency between start/end/freq (lower end if freq skips it) if com.all_not_none(start, end, freq): @@ -1239,16 +1147,8 @@ def interval_range( else: # delegate to the appropriate range function if isinstance(endpoint, Timestamp): - # error: Incompatible types in assignment (expression has type - # "DatetimeIndex", variable has type "ndarray") - breaks = date_range( # type: ignore[assignment] - start=start, end=end, periods=periods, freq=freq - ) + breaks = date_range(start=start, end=end, periods=periods, freq=freq) else: - # error: Incompatible types in assignment (expression has type - # "TimedeltaIndex", variable has type "ndarray") - breaks = timedelta_range( # type: ignore[assignment] - start=start, end=end, periods=periods, freq=freq - ) + breaks = timedelta_range(start=start, end=end, periods=periods, freq=freq) return IntervalIndex.from_breaks(breaks, name=name, closed=closed) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 1a3719233a1da..805420a83108a 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -41,6 +41,7 @@ from pandas.util._decorators import ( Appender, cache_readonly, + deprecate_nonkeyword_arguments, doc, ) @@ -89,11 +90,7 @@ lexsort_indexer, ) -from pandas.io.formats.printing import ( - format_object_attrs, - format_object_summary, - pprint_thing, -) +from pandas.io.formats.printing import pprint_thing if TYPE_CHECKING: from pandas import ( @@ -295,7 +292,6 @@ class MultiIndex(Index): _levels = FrozenList() _codes = FrozenList() _comparables = ["names"] - rename = Index.set_names sortorder: int | None @@ -807,6 +803,7 @@ def _set_levels( self._reset_cache() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "levels"]) def set_levels( self, levels, level=None, inplace=None, verify_integrity: bool = True ): @@ -898,7 +895,7 @@ def set_levels( warnings.warn( "inplace is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=3, ) else: inplace = False @@ -994,6 +991,7 @@ def _set_codes( self._reset_cache() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "codes"]) def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = True): """ Set new codes on MultiIndex. Defaults to returning new index. 
@@ -1061,7 +1059,7 @@ def set_codes(self, codes, level=None, inplace=None, verify_integrity: bool = Tr warnings.warn( "inplace is deprecated and will be removed in a future version.", FutureWarning, - stacklevel=2, + stacklevel=3, ) else: inplace = False @@ -1287,20 +1285,6 @@ def _formatter_func(self, tup): formatter_funcs = [level._formatter_func for level in self.levels] return tuple(func(val) for func, val in zip(formatter_funcs, tup)) - def _format_data(self, name=None) -> str: - """ - Return the formatted data as a unicode string - """ - return format_object_summary( - self, self._formatter_func, name=name, line_break_each_value=True - ) - - def _format_attrs(self): - """ - Return a list of tuples of the (attr,formatted_value). - """ - return format_object_attrs(self, include_dtype=False) - def _format_native_types(self, na_rep="nan", **kwargs): new_levels = [] new_codes = [] @@ -2557,9 +2541,11 @@ def reindex( elif (indexer >= 0).all(): target = self.take(indexer) else: - # hopefully? - target = MultiIndex.from_tuples(target) - + try: + target = MultiIndex.from_tuples(target) + except TypeError: + # not all tuples, see test_constructor_dict_multiindex_reindex_flat + return target, indexer if ( preserve_names and target.nlevels == self.nlevels @@ -3446,6 +3432,7 @@ def _reorder_indexer( new_order = np.arange(n)[indexer] elif is_list_like(k): # Generate a map with all level codes as sorted initially + k = algos.unique(k) key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len( self.levels[i] ) @@ -3574,14 +3561,20 @@ def equal_levels(self, other: MultiIndex) -> bool: def _union(self, other, sort) -> MultiIndex: other, result_names = self._convert_can_do_setop(other) + if ( + any(-1 in code for code in self.codes) + and any(-1 in code for code in self.codes) + or self.has_duplicates + or other.has_duplicates + ): + # This is only necessary if both sides have nans or one has dups, + # fast_unique_multiple is faster + result = super()._union(other, sort) + else: + rvals = other._values.astype(object, copy=False) + result = lib.fast_unique_multiple([self._values, rvals], sort=sort) - # We could get here with CategoricalIndex other - rvals = other._values.astype(object, copy=False) - uniq_tuples = lib.fast_unique_multiple([self._values, rvals], sort=sort) - - return MultiIndex.from_arrays( - zip(*uniq_tuples), sortorder=0, names=result_names - ) + return MultiIndex.from_arrays(zip(*result), sortorder=0, names=result_names) def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return is_object_dtype(dtype) @@ -3594,7 +3587,9 @@ def _get_reconciled_name_object(self, other) -> MultiIndex: """ names = self._maybe_match_names(other) if self.names != names: - return self.rename(names) + # Incompatible return value type (got "Optional[MultiIndex]", expected + # "MultiIndex") + return self.rename(names) # type: ignore[return-value] return self def _maybe_match_names(self, other): @@ -3793,6 +3788,16 @@ def isin(self, values, level=None) -> np.ndarray: return np.zeros(len(levs), dtype=np.bool_) return levs.isin(values) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "names"]) + def set_names(self, names, level=None, inplace: bool = False) -> MultiIndex | None: + return super().set_names(names=names, level=level, inplace=inplace) + + rename = set_names + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def drop_duplicates(self, keep: str | bool = "first") -> MultiIndex: + return super().drop_duplicates(keep=keep) + # 
--------------------------------------------------------------- # Arithmetic/Numeric Methods - Disabled diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index 136843938b683..c1104b80a0a7a 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -4,10 +4,7 @@ datetime, timedelta, ) -from typing import ( - Any, - Hashable, -) +from typing import Hashable import warnings import numpy as np @@ -34,7 +31,6 @@ from pandas.util._decorators import doc from pandas.core.dtypes.common import ( - is_bool_dtype, is_datetime64_any_dtype, is_float, is_integer, @@ -153,14 +149,11 @@ class PeriodIndex(DatetimeIndexOpsMixin): -------- >>> idx = pd.PeriodIndex(year=[2000, 2002], quarter=[1, 3]) >>> idx - PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]', freq='Q-DEC') + PeriodIndex(['2000Q1', '2002Q3'], dtype='period[Q-DEC]') """ _typ = "periodindex" - _attributes = ["name", "freq"] - - # define my properties & methods for delegation - _is_numeric_dtype = False + _attributes = ["name"] _data: PeriodArray freq: BaseOffset @@ -322,70 +315,9 @@ def _is_comparable_dtype(self, dtype: DtypeObj) -> bool: return False return dtype.freq == self.freq - # ------------------------------------------------------------------------ - # Rendering Methods - - def _mpl_repr(self) -> np.ndarray: - # how to represent ourselves to matplotlib - return self.astype(object)._values - - # ------------------------------------------------------------------------ - # Indexing - - @doc(Index.__contains__) - def __contains__(self, key: Any) -> bool: - if isinstance(key, Period): - if key.freq != self.freq: - return False - else: - return key.ordinal in self._engine - else: - hash(key) - try: - self.get_loc(key) - return True - except KeyError: - return False - # ------------------------------------------------------------------------ # Index Methods - def __array_wrap__(self, result, context=None): - """ - Gets called after a ufunc and other functions. - - Needs additional handling as PeriodIndex stores internal data as int - dtype - - Replace this to __numpy_ufunc__ in future version and implement - __array_function__ for Indexes - """ - if isinstance(context, tuple) and len(context) > 0: - func = context[0] - if func is np.add: - pass - elif func is np.subtract: - name = self.name - left = context[1][0] - right = context[1][1] - if isinstance(left, PeriodIndex) and isinstance(right, PeriodIndex): - name = left.name if left.name == right.name else None - return Index(result, name=name) - elif isinstance(left, Period) or isinstance(right, Period): - return Index(result, name=name) - elif isinstance(func, np.ufunc): - if "M->M" not in func.types: - msg = f"ufunc '{func.__name__}' not supported for the PeriodIndex" - # This should be TypeError, but TypeError cannot be raised - # from here because numpy catches. 
- raise ValueError(msg) - - if is_bool_dtype(result): - return result - # the result is object dtype array of Period - # cannot pass _simple_new as it is - return type(self)(result, freq=self.freq, name=self.name) - def asof_locs(self, where: Index, mask: np.ndarray) -> np.ndarray: """ where : array of timestamps @@ -519,6 +451,8 @@ def get_loc(self, key, method=None, tolerance=None): elif is_integer(key): # Period constructor will cast to string, which we dont want raise KeyError(key) + elif isinstance(key, Period) and key.freq != self.freq: + raise KeyError(key) try: key = Period(key, freq=self.freq) @@ -636,7 +570,7 @@ def period_range( PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06', '2017-07', '2017-08', '2017-09', '2017-10', '2017-11', '2017-12', '2018-01'], - dtype='period[M]', freq='M') + dtype='period[M]') If ``start`` or ``end`` are ``Period`` objects, they will be used as anchor endpoints for a ``PeriodIndex`` with frequency matching that of the @@ -645,7 +579,7 @@ def period_range( >>> pd.period_range(start=pd.Period('2017Q1', freq='Q'), ... end=pd.Period('2017Q2', freq='Q'), freq='M') PeriodIndex(['2017-03', '2017-04', '2017-05', '2017-06'], - dtype='period[M]', freq='M') + dtype='period[M]') """ if com.count_not_none(start, end, periods) != 2: raise ValueError( diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0e6fb77e8b51b..ead1a2a4a544b 100644 --- a/pandas/core/indexes/range.py +++ b/pandas/core/indexes/range.py @@ -604,11 +604,6 @@ def _min_fitting_element(self, lower_limit: int) -> int: no_steps = -(-(lower_limit - self.start) // abs(self.step)) return self.start + abs(self.step) * no_steps - def _max_fitting_element(self, upper_limit: int) -> int: - """Returns the largest element smaller than or equal to the limit""" - no_steps = (upper_limit - self.start) // abs(self.step) - return self.start + abs(self.step) * no_steps - def _extended_gcd(self, a: int, b: int) -> tuple[int, int, int]: """ Extended Euclidean algorithms to solve Bezout's identity: diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index ec97fa1e05851..cb83a0bccc748 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -112,10 +112,6 @@ class TimedeltaIndex(DatetimeTimedeltaMixin): _data_cls = TimedeltaArray _engine_type = libindex.TimedeltaEngine - _comparables = ["name", "freq"] - _attributes = ["name", "freq"] - _is_numeric_dtype = False - _data: TimedeltaArray # ------------------------------------------------------------------- diff --git a/pandas/core/indexing.py b/pandas/core/indexing.py index 0a06dff790cbf..d5555561088eb 100644 --- a/pandas/core/indexing.py +++ b/pandas/core/indexing.py @@ -886,26 +886,22 @@ def _getitem_nested_tuple(self, tup: tuple): # handle the multi-axis by taking sections and reducing # this is iterative obj = self.obj - axis = 0 - for key in tup: + # GH#41369 Loop in reverse order ensures indexing along columns before rows + # which selects only necessary blocks which avoids dtype conversion if possible + axis = len(tup) - 1 + for key in tup[::-1]: if com.is_null_slice(key): - axis += 1 + axis -= 1 continue - current_ndim = obj.ndim obj = getattr(obj, self.name)._getitem_axis(key, axis=axis) - axis += 1 + axis -= 1 # if we have a scalar, we are done if is_scalar(obj) or not hasattr(obj, "ndim"): break - # has the dim of the obj changed? 
- # GH 7199 - if obj.ndim < current_ndim: - axis -= 1 - return obj def _convert_to_indexer(self, key, axis: int, is_setter: bool = False): @@ -1938,7 +1934,9 @@ def _setitem_with_indexer_missing(self, indexer, value): # e.g. 0.0 -> 0 # GH#12246 if index.is_unique: - new_indexer = index.get_indexer([new_index[-1]]) + # pass new_index[-1:] instead if [new_index[-1]] + # so that we retain dtype + new_indexer = index.get_indexer(new_index[-1:]) if (new_indexer != -1).any(): # We get only here with loc, so can hard code return self._setitem_with_indexer(new_indexer, value, "loc") diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 4f1b16e747394..c7769046c70b2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -6,6 +6,8 @@ TYPE_CHECKING, Any, Callable, + Iterable, + Sequence, cast, ) import warnings @@ -393,7 +395,7 @@ def reduce(self, func, ignore_failures: bool = False) -> list[Block]: return [] raise - if np.ndim(result) == 0: + if self.values.ndim == 1: # TODO(EA2D): special case not needed with 2D EAs res_values = np.array([[result]]) else: @@ -763,8 +765,8 @@ def _replace_regex( @final def _replace_list( self, - src_list: list[Any], - dest_list: list[Any], + src_list: Iterable[Any], + dest_list: Sequence[Any], inplace: bool = False, regex: bool = False, ) -> list[Block]: @@ -779,6 +781,14 @@ def _replace_list( # so un-tile here return self.replace(src_list, dest_list[0], inplace, regex) + # https://github.com/pandas-dev/pandas/issues/40371 + # the following pairs check code caused a regression so we catch that case here + # until the issue is fixed properly in can_hold_element + + # error: "Iterable[Any]" has no attribute "tolist" + if hasattr(src_list, "tolist"): + src_list = src_list.tolist() # type: ignore[attr-defined] + # Exclude anything that we know we won't contain pairs = [ (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x) @@ -1316,7 +1326,6 @@ def quantile( assert is_list_like(qs) # caller is responsible for this result = quantile_compat(self.values, np.asarray(qs._values), interpolation) - return new_block(result, placement=self._mgr_locs, ndim=2) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 5e58f6148e6ad..270eddf2bd3a5 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -27,7 +27,6 @@ from pandas.core.dtypes.cast import ( construct_1d_arraylike_from_scalar, construct_1d_ndarray_preserving_na, - dict_compat, maybe_cast_to_datetime, maybe_convert_platform, maybe_infer_to_datetimelike, @@ -47,10 +46,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.generic import ( ABCDataFrame, - ABCDatetimeIndex, - ABCIndex, ABCSeries, - ABCTimedeltaIndex, ) from pandas.core import ( @@ -64,8 +60,10 @@ TimedeltaArray, ) from pandas.core.construction import ( + create_series_with_explicit_dtype, ensure_wrapped_if_datetimelike, extract_array, + range_to_ndarray, sanitize_array, ) from pandas.core.indexes import base as ibase @@ -100,7 +98,7 @@ def arrays_to_mgr( arrays, - arr_names, + arr_names: Index, index, columns, *, @@ -114,8 +112,6 @@ def arrays_to_mgr( Needs to handle a lot of exceptional cases. 
""" - arr_names = ensure_index(arr_names) - if verify_integrity: # figure out the index, if necessary if index is None: @@ -285,10 +281,12 @@ def ndarray_to_mgr( if columns is None: columns = Index(range(len(values))) + else: + columns = ensure_index(columns) return arrays_to_mgr(values, columns, index, columns, dtype=dtype, typ=typ) - if is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): + elif is_extension_array_dtype(vdtype) and not is_1d_only_ea_dtype(vdtype): # i.e. Datetime64TZ values = extract_array(values, extract_numpy=True) if copy: @@ -453,7 +451,7 @@ def dict_to_mgr( arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] # GH#24096 need copy to be deep for datetime64tz case # TODO: See if we can avoid these copies - arrays = [arr if not isinstance(arr, ABCIndex) else arr._data for arr in arrays] + arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays] arrays = [ arr if not is_datetime64tz_dtype(arr) else arr.copy() for arr in arrays ] @@ -479,7 +477,7 @@ def nested_data_to_arrays( columns: Index | None, index: Index | None, dtype: DtypeObj | None, -): +) -> tuple[list[ArrayLike], Index, Index]: """ Convert a single sequence of arrays to multiple arrays. """ @@ -530,15 +528,12 @@ def _prep_ndarray(values, copy: bool = True) -> np.ndarray: if len(values) == 0: return np.empty((0, 0), dtype=object) elif isinstance(values, range): - arr = np.arange(values.start, values.stop, values.step, dtype="int64") + arr = range_to_ndarray(values) return arr[..., np.newaxis] def convert(v): if not is_list_like(v) or isinstance(v, ABCDataFrame): return v - elif not hasattr(v, "dtype") and not isinstance(v, (list, tuple, range)): - # TODO: should we cast these to list? - return v v = extract_array(v, extract_numpy=True) res = maybe_convert_platform(v) @@ -550,7 +545,7 @@ def convert(v): if is_list_like(values[0]): values = np.array([convert(v) for v in values]) elif isinstance(values[0], np.ndarray) and values[0].ndim == 0: - # GH#21861 + # GH#21861 see test_constructor_list_of_lists values = np.array([convert(v) for v in values]) else: values = convert(values) @@ -568,33 +563,25 @@ def convert(v): return values -def _homogenize(data, index: Index, dtype: DtypeObj | None): - oindex = None +def _homogenize(data, index: Index, dtype: DtypeObj | None) -> list[ArrayLike]: homogenized = [] for val in data: if isinstance(val, ABCSeries): if dtype is not None: - val = val.astype(dtype) + val = val.astype(dtype, copy=False) if val.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) - # TODO extract_array should be preferred, but that gives failures for - # `extension/test_numpy.py` (extract_array will convert numpy arrays - # to PandasArray), see https://github.com/pandas-dev/pandas/issues/40021 - # val = extract_array(val, extract_numpy=True) + val = val._values else: if isinstance(val, dict): - if oindex is None: - oindex = index.astype("O") - - if isinstance(index, (ABCDatetimeIndex, ABCTimedeltaIndex)): - val = dict_compat(val) - else: - val = dict(val) - val = lib.fast_multiget(val, oindex._values, default=np.nan) + # see test_constructor_subclass_dict + # test_constructor_dict_datetime64_index + val = create_series_with_explicit_dtype(val, index=index)._values + val = sanitize_array( val, index, dtype=dtype, copy=False, raise_cast_failure=False ) @@ -751,6 +738,7 @@ def to_arrays( Return list of arrays, columns. 
""" if isinstance(data, ABCDataFrame): + # see test_from_records_with_index_data, test_from_records_bad_index_column if columns is not None: arrays = [ data._ixs(i, axis=1).values @@ -886,7 +874,7 @@ def _list_of_dict_to_arrays( # assure that they are of the base dict class and not of derived # classes - data = [(type(d) is dict) and d or dict(d) for d in data] + data = [d if type(d) is dict else dict(d) for d in data] content = lib.dicts_to_array(data, list(columns)) return content, columns diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index b8909f16ee876..673c482bced18 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -245,8 +245,7 @@ def _maybe_get_mask( """ if mask is None: if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype): - # Boolean data cannot contain nulls, so signal via mask being None - return None + return np.broadcast_to(False, values.shape) if skipna or needs_i8_conversion(values.dtype): mask = isna(values) diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 4791bbf0ba7f7..c05130278f75b 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -27,7 +27,6 @@ ArrayLike, DtypeObj, FrameOrSeries, - FrameOrSeriesUnion, IndexLabel, Suffixes, ) @@ -81,15 +80,18 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import DataFrame + from pandas import ( + DataFrame, + Series, + ) from pandas.core.arrays import DatetimeArray -@Substitution("\nleft : DataFrame") +@Substitution("\nleft : DataFrame or named Series") @Appender(_merge_doc, indents=0) def merge( - left: FrameOrSeriesUnion, - right: FrameOrSeriesUnion, + left: DataFrame | Series, + right: DataFrame | Series, how: str = "inner", on: IndexLabel | None = None, left_on: IndexLabel | None = None, @@ -322,8 +324,8 @@ def _merger(x, y) -> DataFrame: def merge_asof( - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -362,8 +364,8 @@ def merge_asof( Parameters ---------- - left : DataFrame - right : DataFrame + left : DataFrame or named Series + right : DataFrame or named Series on : label Field name to join on. Must be found in both DataFrames. The data MUST be ordered. 
Furthermore this must be a numeric column, @@ -608,8 +610,8 @@ class _MergeOperation: def __init__( self, - left: FrameOrSeriesUnion, - right: FrameOrSeriesUnion, + left: DataFrame | Series, + right: DataFrame | Series, how: str = "inner", on: IndexLabel | None = None, left_on: IndexLabel | None = None, @@ -1473,7 +1475,7 @@ def get_join_indexers( for n in range(len(left_keys)) ) zipped = zip(*mapped) - llab, rlab, shape = [list(x) for x in zipped] + llab, rlab, shape = (list(x) for x in zipped) # get flat i8 keys from label lists lkey, rkey = _get_join_keys(llab, rlab, shape, sort) @@ -1599,8 +1601,8 @@ class _OrderedMerge(_MergeOperation): def __init__( self, - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -1704,8 +1706,8 @@ class _AsOfMerge(_OrderedMerge): def __init__( self, - left: DataFrame, - right: DataFrame, + left: DataFrame | Series, + right: DataFrame | Series, on: IndexLabel | None = None, left_on: IndexLabel | None = None, right_on: IndexLabel | None = None, @@ -1983,7 +1985,7 @@ def _get_multiindex_indexer( for n in range(index.nlevels) ) zipped = zip(*mapped) - rcodes, lcodes, shape = [list(x) for x in zipped] + rcodes, lcodes, shape = (list(x) for x in zipped) if sort: rcodes = list(map(np.take, rcodes, index.codes)) else: diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 7b9c3883d74e3..64daf2542e15a 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -250,7 +250,7 @@ def cut( raise ValueError("Cannot cut empty array") rng = (nanops.nanmin(x), nanops.nanmax(x)) - mn, mx = [mi + 0.0 for mi in rng] + mn, mx = (mi + 0.0 for mi in rng) if np.isinf(mn) or np.isinf(mx): # GH 24314 diff --git a/pandas/core/series.py b/pandas/core/series.py index d8b7876028839..2f45a2adbdec7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1308,6 +1308,7 @@ def repeat(self, repeats, axis=None) -> Series: self, method="repeat" ) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "level"]) def reset_index(self, level=None, drop=False, name=None, inplace=False): """ Generate a new DataFrame or Series with the index reset. @@ -2057,6 +2058,7 @@ def drop_duplicates(self, *, inplace: Literal[True]) -> None: def drop_duplicates(self, keep=..., inplace: bool = ...) -> Series | None: ... + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def drop_duplicates(self, keep="first", inplace=False) -> Series | None: """ Return Series with duplicate values removed. @@ -3257,6 +3259,7 @@ def update(self, other) -> None: # ---------------------------------------------------------------------- # Reindexing, sorting + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_values( self, axis=0, @@ -3467,6 +3470,7 @@ def sort_values( else: return result.__finalize__(self, method="sort_values") + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def sort_index( self, axis=0, @@ -4479,6 +4483,7 @@ def set_axis(self, labels, *, inplace: Literal[True]) -> None: def set_axis(self, labels, axis: Axis = ..., inplace: bool = ...) -> Series | None: ... 
+ @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) @Appender( """ Examples @@ -4518,6 +4523,7 @@ def set_axis(self, labels, axis: Axis = 0, inplace: bool = False): def reindex(self, index=None, **kwargs): return super().reindex(index=index, **kwargs) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "labels"]) def drop( self, labels=None, @@ -5065,10 +5071,7 @@ def _convert_dtypes( convert_boolean, convert_floating, ) - try: - result = input_series.astype(inferred_dtype) - except TypeError: - result = input_series.copy() + result = input_series.astype(inferred_dtype) else: result = input_series.copy() return result @@ -5093,6 +5096,7 @@ def notna(self) -> Series: def notnull(self) -> Series: return super().notnull() + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) def dropna(self, axis=0, inplace=False, how=None): """ Return a new Series with missing values removed. @@ -5290,6 +5294,40 @@ def to_period(self, freq=None, copy=True) -> Series: self, method="to_period" ) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def ffill( + self: Series, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> Series | None: + return super().ffill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"]) + def bfill( + self: Series, + axis: None | Axis = None, + inplace: bool = False, + limit: None | int = None, + downcast=None, + ) -> Series | None: + return super().bfill(axis, inplace, limit, downcast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "lower", "upper"] + ) + def clip( + self: Series, + lower=None, + upper=None, + axis: Axis | None = None, + inplace: bool = False, + *args, + **kwargs, + ) -> Series | None: + return super().clip(lower, upper, axis, inplace, *args, **kwargs) + @deprecate_nonkeyword_arguments(version=None, allowed_args=["self", "method"]) def interpolate( self: Series, @@ -5313,6 +5351,36 @@ def interpolate( **kwargs, ) + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def where( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().where(cond, other, inplace, axis, level, errors, try_cast) + + @deprecate_nonkeyword_arguments( + version=None, allowed_args=["self", "cond", "other"] + ) + def mask( + self, + cond, + other=np.nan, + inplace=False, + axis=None, + level=None, + errors="raise", + try_cast=lib.no_default, + ): + return super().mask(cond, other, inplace, axis, level, errors, try_cast) + # ---------------------------------------------------------------------- # Add index _AXIS_ORDERS = ["index"] diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 98d209ae4a899..7643019ff8c55 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -13,10 +13,7 @@ import numpy as np import pandas._libs.lib as lib -from pandas._typing import ( - ArrayLike, - FrameOrSeriesUnion, -) +from pandas._typing import FrameOrSeriesUnion from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -160,7 +157,6 @@ class StringMethods(NoNewAttributesMixin): # TODO: Dispatch all the methods # Currently the following are not dispatched to the array # * cat - # * extract # * extractall def __init__(self, data): @@ -243,7 +239,7 @@ def _wrap_result( self, result, 
name=None, - expand=None, + expand: bool | None = None, fill_value=np.nan, returns_string=True, ): @@ -1358,14 +1354,13 @@ def replace( "*not* be treated as literal strings when regex=True." ) warnings.warn(msg, FutureWarning, stacklevel=3) - regex = True # Check whether repl is valid (GH 13438, GH 15055) if not (isinstance(repl, str) or callable(repl)): raise TypeError("repl must be a string or callable") is_compiled_re = is_re(pat) - if regex: + if regex or regex is None: if is_compiled_re and (case is not None or flags != 0): raise ValueError( "case and flags cannot be set when pat is a compiled regex" @@ -1378,6 +1373,14 @@ def replace( elif callable(repl): raise ValueError("Cannot use a callable replacement when regex=False") + # The current behavior is to treat single character patterns as literal strings, + # even when ``regex`` is set to ``True``. + if isinstance(pat, str) and len(pat) == 1: + regex = False + + if regex is None: + regex = True + if case is None: case = True @@ -2378,6 +2381,8 @@ def extract( 2 NaN dtype: object """ + from pandas import DataFrame + if not isinstance(expand, bool): raise ValueError("expand must be True or False") @@ -2388,8 +2393,37 @@ def extract( if not expand and regex.groups > 1 and isinstance(self._data, ABCIndex): raise ValueError("only one regex group is supported with Index") - # TODO: dispatch - return str_extract(self, pat, flags, expand=expand) + obj = self._data + result_dtype = _result_dtype(obj) + + returns_df = regex.groups > 1 or expand + + if returns_df: + name = None + columns = _get_group_names(regex) + + if obj.array.size == 0: + result = DataFrame(columns=columns, dtype=result_dtype) + + else: + result_list = self._data.array._str_extract( + pat, flags=flags, expand=returns_df + ) + + result_index: Index | None + if isinstance(obj, ABCSeries): + result_index = obj.index + else: + result_index = None + + result = DataFrame( + result_list, columns=columns, index=result_index, dtype=result_dtype + ) + + else: + name = _get_single_group_name(regex) + result = self._data.array._str_extract(pat, flags=flags, expand=returns_df) + return self._wrap_result(result, name=name) @forbid_nonstring_types(["bytes"]) def extractall(self, pat, flags=0): @@ -3076,72 +3110,6 @@ def _get_group_names(regex: re.Pattern) -> list[Hashable]: return [names.get(1 + i, i) for i in range(regex.groups)] -def _str_extract(arr: ArrayLike, pat: str, flags=0, expand: bool = True): - """ - Find groups in each string in the array using passed regular expression. 
- - Returns - ------- - np.ndarray or list of lists is expand is True - """ - regex = re.compile(pat, flags=flags) - - empty_row = [np.nan] * regex.groups - - def f(x): - if not isinstance(x, str): - return empty_row - m = regex.search(x) - if m: - return [np.nan if item is None else item for item in m.groups()] - else: - return empty_row - - if expand: - return [f(val) for val in np.asarray(arr)] - - return np.array([f(val)[0] for val in np.asarray(arr)], dtype=object) - - -def str_extract(accessor: StringMethods, pat: str, flags: int = 0, expand: bool = True): - from pandas import ( - DataFrame, - array as pd_array, - ) - - obj = accessor._data - result_dtype = _result_dtype(obj) - regex = re.compile(pat, flags=flags) - returns_df = regex.groups > 1 or expand - - if returns_df: - name = None - columns = _get_group_names(regex) - - if obj.array.size == 0: - result = DataFrame(columns=columns, dtype=result_dtype) - - else: - result_list = _str_extract(obj.array, pat, flags=flags, expand=returns_df) - - result_index: Index | None - if isinstance(obj, ABCSeries): - result_index = obj.index - else: - result_index = None - - result = DataFrame( - result_list, columns=columns, index=result_index, dtype=result_dtype - ) - - else: - name = _get_single_group_name(regex) - result_arr = _str_extract(obj.array, pat, flags=flags, expand=returns_df) - # not dispatching, so we have to reconstruct here. - result = pd_array(result_arr, dtype=result_dtype) - return accessor._wrap_result(result, name=name) - - def str_extractall(arr, pat, flags=0): regex = re.compile(pat, flags=flags) # the regex must contain capture groups. diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index 730870b448cb2..cd71844d3b527 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -230,3 +230,7 @@ def _str_split(self, pat=None, n=-1, expand=False): @abc.abstractmethod def _str_rsplit(self, pat=None, n=-1): pass + + @abc.abstractmethod + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index fb9fd77d21732..7ce4abe904f3b 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -32,7 +32,9 @@ def __len__(self): # For typing, _str_map relies on the object being sized. raise NotImplementedError - def _str_map(self, f, na_value=None, dtype: Dtype | None = None): + def _str_map( + self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True + ): """ Map a callable over valid element of the array. @@ -47,6 +49,8 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): for object-dtype and Categorical and ``pd.NA`` for StringArray. dtype : Dtype, optional The dtype of the result array. + convert : bool, default True + Whether to call `maybe_convert_objects` on the resulting ndarray """ if dtype is None: dtype = np.dtype("object") @@ -60,9 +64,9 @@ def _str_map(self, f, na_value=None, dtype: Dtype | None = None): arr = np.asarray(self, dtype=object) mask = isna(arr) - convert = not np.all(mask) + map_convert = convert and not np.all(mask) try: - result = lib.map_infer_mask(arr, f, mask.view(np.uint8), convert) + result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert) except (TypeError, AttributeError) as e: # Reraise the exception if callable `f` got wrong number of args. 
# The user may want to be warned by this, instead of getting NaN @@ -88,7 +92,7 @@ def g(x): return result if na_value is not np.nan: np.putmask(result, mask, na_value) - if result.dtype == object: + if convert and result.dtype == object: result = lib.maybe_convert_objects(result) return result @@ -145,10 +149,10 @@ def _str_replace( # add case flag, if provided flags |= re.IGNORECASE - if regex and ( - isinstance(pat, re.Pattern) or len(pat) > 1 or flags or callable(repl) - ): + if regex or flags or callable(repl): if not isinstance(pat, re.Pattern): + if regex is False: + pat = re.escape(pat) pat = re.compile(pat, flags=flags) n = n if n >= 0 else 0 @@ -410,3 +414,28 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) + + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): + regex = re.compile(pat, flags=flags) + na_value = self._str_na_value + + if not expand: + + def g(x): + m = regex.search(x) + return m.groups()[0] if m else na_value + + return self._str_map(g, convert=False) + + empty_row = [na_value] * regex.groups + + def f(x): + if not isinstance(x, str): + return empty_row + m = regex.search(x) + if m: + return [na_value if item is None else item for item in m.groups()] + else: + return empty_row + + return [f(val) for val in np.asarray(self)] diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index d85aa20de5ab4..e0720c5d86df1 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -1,7 +1,6 @@ """Common utility functions for rolling operations""" from collections import defaultdict from typing import cast -import warnings import numpy as np @@ -15,17 +14,7 @@ def flex_binary_moment(arg1, arg2, f, pairwise=False): - if not ( - isinstance(arg1, (np.ndarray, ABCSeries, ABCDataFrame)) - and isinstance(arg2, (np.ndarray, ABCSeries, ABCDataFrame)) - ): - raise TypeError( - "arguments to moment function must be of type np.ndarray/Series/DataFrame" - ) - - if isinstance(arg1, (np.ndarray, ABCSeries)) and isinstance( - arg2, (np.ndarray, ABCSeries) - ): + if isinstance(arg1, ABCSeries) and isinstance(arg2, ABCSeries): X, Y = prep_binary(arg1, arg2) return f(X, Y) @@ -43,7 +32,7 @@ def dataframe_from_int_dict(data, frame_template): if pairwise is False: if arg1 is arg2: # special case in order to handle duplicate column names - for i, col in enumerate(arg1.columns): + for i in range(len(arg1.columns)): results[i] = f(arg1.iloc[:, i], arg2.iloc[:, i]) return dataframe_from_int_dict(results, arg1) else: @@ -51,23 +40,17 @@ def dataframe_from_int_dict(data, frame_template): raise ValueError("'arg1' columns are not unique") if not arg2.columns.is_unique: raise ValueError("'arg2' columns are not unique") - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - X, Y = arg1.align(arg2, join="outer") - X = X + 0 * Y - Y = Y + 0 * X - - with warnings.catch_warnings(record=True): - warnings.simplefilter("ignore", RuntimeWarning) - res_columns = arg1.columns.union(arg2.columns) + X, Y = arg1.align(arg2, join="outer") + X, Y = prep_binary(X, Y) + res_columns = arg1.columns.union(arg2.columns) for col in res_columns: if col in X and col in Y: results[col] = f(X[col], Y[col]) return DataFrame(results, index=X.index, columns=res_columns) elif pairwise is True: results = defaultdict(dict) - for i, k1 in enumerate(arg1.columns): - for j, k2 in enumerate(arg2.columns): + for i in range(len(arg1.columns)): + for j in 
range(len(arg2.columns)): if j < i and arg2 is arg1: # Symmetric case results[i][j] = results[j][i] @@ -85,10 +68,10 @@ def dataframe_from_int_dict(data, frame_template): result = concat( [ concat( - [results[i][j] for j, c in enumerate(arg2.columns)], + [results[i][j] for j in range(len(arg2.columns))], ignore_index=True, ) - for i, c in enumerate(arg1.columns) + for i in range(len(arg1.columns)) ], ignore_index=True, axis=1, @@ -135,13 +118,10 @@ def dataframe_from_int_dict(data, frame_template): ) return result - - else: - raise ValueError("'pairwise' is not True/False") else: results = { i: f(*prep_binary(arg1.iloc[:, i], arg2)) - for i, col in enumerate(arg1.columns) + for i in range(len(arg1.columns)) } return dataframe_from_int_dict(results, arg1) @@ -165,11 +145,7 @@ def zsqrt(x): def prep_binary(arg1, arg2): - if not isinstance(arg2, type(arg1)): - raise Exception("Input arrays must be of the same type!") - # mask out values, this also makes a common index... X = arg1 + 0 * arg2 Y = arg2 + 0 * arg1 - return X, Y diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index 0ef0896df8d44..2d5f148a6437a 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -291,6 +291,7 @@ def __repr__(self) -> str: def __iter__(self): obj = self._create_data(self._selected_obj) + obj = obj.set_axis(self._on) indexer = self._get_window_indexer() start, end = indexer.get_window_bounds( @@ -471,6 +472,8 @@ def _apply_pairwise( other = target # only default unset pairwise = True if pairwise is None else pairwise + elif not isinstance(other, (ABCDataFrame, ABCSeries)): + raise ValueError("other must be a DataFrame or Series") return flex_binary_moment(target, other, func, pairwise=bool(pairwise)) diff --git a/pandas/io/clipboards.py b/pandas/io/clipboards.py index 00a99eb8a4480..a6940c08198b0 100644 --- a/pandas/io/clipboards.py +++ b/pandas/io/clipboards.py @@ -58,9 +58,14 @@ def read_clipboard(sep=r"\s+", **kwargs): # pragma: no cover # 0 1 2 # 1 3 4 - counts = {x.lstrip().count("\t") for x in lines} + counts = {x.lstrip(" ").count("\t") for x in lines} if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0: sep = "\t" + # check the number of leading tabs in the first line + # to account for index columns + index_length = len(lines[0]) - len(lines[0].lstrip(" \t")) + if index_length != 0: + kwargs.setdefault("index_col", list(range(index_length))) # Edge case where sep is specified to be None, return to default if sep is None and kwargs.get("delim_whitespace") is None: diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 9b8e40a977545..42ca68376452d 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -2,7 +2,6 @@ import abc import datetime -import inspect from io import BytesIO import os from textwrap import fill @@ -33,6 +32,7 @@ deprecate_nonkeyword_arguments, doc, ) +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import ( is_bool, @@ -245,6 +245,10 @@ Convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric data will be read in as floats: Excel stores all numbers as floats internally. + + .. deprecated:: 1.3.0 + convert_float will be removed in a future version + mangle_dupe_cols : bool, default True Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. 
Passing in False will cause data to be overwritten if there @@ -355,7 +359,7 @@ def read_excel( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, storage_options: StorageOptions = None, ): @@ -489,11 +493,21 @@ def parse( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, **kwds, ): + if convert_float is None: + convert_float = True + else: + stacklevel = find_stack_level() + warnings.warn( + "convert_float is deprecated and will be removed in a future version", + FutureWarning, + stacklevel=stacklevel, + ) + validate_header_arg(header) ret_dict = False @@ -1206,16 +1220,7 @@ def __init__( f"only the xls format is supported. Install openpyxl instead." ) elif ext and ext != "xls": - caller = inspect.stack()[1] - if ( - caller.filename.endswith( - os.path.join("pandas", "io", "excel", "_base.py") - ) - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 + stacklevel = find_stack_level() warnings.warn( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install " @@ -1251,7 +1256,7 @@ def parse( thousands=None, comment=None, skipfooter=0, - convert_float=True, + convert_float=None, mangle_dupe_cols=True, **kwds, ): diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 648df0ff2b6d9..c6ff4e2180893 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -773,7 +773,7 @@ def _generate_body(self, coloffset: int) -> Iterable[ExcelCell]: series = self.df.iloc[:, colidx] for i, val in enumerate(series): if styles is not None: - css = ";".join([a + ":" + str(v) for (a, v) in styles[i, colidx]]) + css = ";".join(a + ":" + str(v) for (a, v) in styles[i, colidx]) xlstyle = self.style_converter(css) yield ExcelCell(self.rowcounter + i, colidx + coloffset, val, xlstyle) diff --git a/pandas/io/formats/latex.py b/pandas/io/formats/latex.py index fce0814e979a4..476a3647207d6 100644 --- a/pandas/io/formats/latex.py +++ b/pandas/io/formats/latex.py @@ -361,7 +361,7 @@ def get_result(self) -> str: self.bottom_separator, self.env_end, ] - result = "\n".join([item for item in elements if item]) + result = "\n".join(item for item in elements if item) trailing_newline = "\n" result += trailing_newline return result @@ -530,13 +530,13 @@ def env_begin(self) -> str: f"\\begin{{longtable}}{self._position_macro}{{{self.column_format}}}" ) elements = [first_row, f"{self._caption_and_label()}"] - return "\n".join([item for item in elements if item]) + return "\n".join(item for item in elements if item) def _caption_and_label(self) -> str: if self.caption or self.label: double_backslash = "\\\\" elements = [f"{self._caption_macro}", f"{self._label_macro}"] - caption_and_label = "\n".join([item for item in elements if item]) + caption_and_label = "\n".join(item for item in elements if item) caption_and_label += double_backslash return caption_and_label else: @@ -614,7 +614,7 @@ def env_begin(self) -> str: f"{self._label_macro}", f"\\begin{{tabular}}{{{self.column_format}}}", ] - return "\n".join([item for item in elements if item]) + return "\n".join(item for item in elements if item) @property def bottom_separator(self) -> str: diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 56e34d9500f31..73924631aea5c 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -21,6 +21,7 @@ from pandas._typing import ( Axis, + FilePathOrBuffer, FrameOrSeries, 
FrameOrSeriesUnion, IndexLabel, @@ -30,6 +31,7 @@ from pandas.util._decorators import doc import pandas as pd +from pandas import RangeIndex from pandas.api.types import is_list_like from pandas.core import generic import pandas.core.common as com @@ -39,6 +41,8 @@ ) from pandas.core.generic import NDFrame +from pandas.io.formats.format import save_to_buffer + jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.") from pandas.io.formats.style_render import ( @@ -70,7 +74,7 @@ def _mpl(func: Callable): class Styler(StylerRenderer): - """ + r""" Helps style a DataFrame or Series according to the data with HTML and CSS. Parameters @@ -115,9 +119,12 @@ class Styler(StylerRenderer): .. versionadded:: 1.3.0 - escape : bool, default False - Replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` in cell display - strings with HTML-safe sequences. + escape : str, optional + Use 'html' to replace the characters ``&``, ``<``, ``>``, ``'``, and ``"`` + in cell display string with HTML-safe sequences. + Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, + ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with + LaTeX-safe sequences. ... versionadded:: 1.3.0 @@ -175,7 +182,7 @@ def __init__( uuid_len: int = 5, decimal: str = ".", thousands: str | None = None, - escape: bool = False, + escape: str | None = None, ): super().__init__( data=data, @@ -403,6 +410,406 @@ def to_excel( engine=engine, ) + def to_latex( + self, + buf: FilePathOrBuffer[str] | None = None, + *, + column_format: str | None = None, + position: str | None = None, + position_float: str | None = None, + hrules: bool = False, + label: str | None = None, + caption: str | None = None, + sparse_index: bool | None = None, + sparse_columns: bool | None = None, + multirow_align: str = "c", + multicol_align: str = "r", + siunitx: bool = False, + encoding: str | None = None, + ): + r""" + Write Styler to a file, buffer or string in LaTeX format. + + .. versionadded:: 1.3.0 + + Parameters + ---------- + buf : str, Path, or StringIO-like, optional, default None + Buffer to write to. If ``None``, the output is returned as a string. + column_format : str, optional + The LaTeX column specification placed in location: + + \\begin{tabular}{} + + Defaults to 'l' for index and + non-numeric data columns, and, for numeric data columns, + to 'r' by default, or 'S' if ``siunitx`` is ``True``. + position : str, optional + The LaTeX positional argument (e.g. 'h!') for tables, placed in location: + + \\begin{table}[] + position_float : {"centering", "raggedleft", "raggedright"}, optional + The LaTeX float command placed in location: + + \\begin{table}[] + + \\ + hrules : bool, default False + Set to `True` to add \\toprule, \\midrule and \\bottomrule from the + {booktabs} LaTeX package. + label : str, optional + The LaTeX label included as: \\label{