diff --git a/.circleci/config.yml b/.circleci/config.yml index e704c37df3e45..dfaade1d69c75 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -6,7 +6,7 @@ jobs: image: ubuntu-2004:2022.04.1 resource_class: arm.large environment: - ENV_FILE: ci/deps/circle-38-arm64.yaml + ENV_FILE: ci/deps/circle-310-arm64.yaml PYTEST_WORKERS: auto PATTERN: "not single_cpu and not slow and not network and not clipboard and not arm_slow and not db" PYTEST_TARGET: "pandas" @@ -26,7 +26,6 @@ jobs: image: ubuntu-2004:2022.04.1 resource_class: arm.large environment: - ENV_FILE: ci/deps/circle-38-arm64.yaml TRIGGER_SOURCE: << pipeline.trigger_source >> steps: - checkout @@ -92,4 +91,4 @@ workflows: only: /^v.*/ matrix: parameters: - cibw-build: ["cp38-manylinux_aarch64", "cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"] + cibw-build: ["cp39-manylinux_aarch64", "cp310-manylinux_aarch64", "cp311-manylinux_aarch64"] diff --git a/.circleci/setup_env.sh b/.circleci/setup_env.sh index 7d13e202e951e..e41650870bd70 100755 --- a/.circleci/setup_env.sh +++ b/.circleci/setup_env.sh @@ -54,10 +54,7 @@ if pip list | grep -q ^pandas; then pip uninstall -y pandas || true fi -echo "Build extensions" -python setup.py build_ext -q -j4 - echo "Install pandas" -python -m pip install --no-build-isolation --no-use-pep517 -e . +python -m pip install --no-build-isolation -ve . echo "done" diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000000000..3783632bdbe84 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,22 @@ +# github +.github/ @mroeschke + +# ci +ci/ @mroeschke + +# web +web/ @datapythonista + +# docs +doc/cheatsheet @Dr-Irv + +# pandas +pandas/_libs/ @WillAyd +pandas/_libs/tslibs/* @MarcoGorelli +pandas/_typing.py @Dr-Irv +pandas/core/groupby/* @rhshadrach +pandas/core/tools/datetimes.py @MarcoGorelli +pandas/io/excel/* @rhshadrach +pandas/io/formats/style.py @attack68 +pandas/io/formats/style_render.py @attack68 +pandas/io/formats/templates @attack68 diff --git a/.github/actions/build_pandas/action.yml b/.github/actions/build_pandas/action.yml index 11601564c5d79..2d6b0aada4abd 100644 --- a/.github/actions/build_pandas/action.yml +++ b/.github/actions/build_pandas/action.yml @@ -1,5 +1,9 @@ name: Build pandas description: Rebuilds the C extensions and installs pandas +inputs: + editable: + description: Whether to build pandas in editable mode (default true) + default: true runs: using: composite steps: @@ -10,11 +14,18 @@ runs: micromamba list shell: bash -el {0} + - name: Uninstall existing Pandas installation + run: | + if pip list | grep -q ^pandas; then + pip uninstall -y pandas || true + fi + shell: bash -el {0} + - name: Build Pandas run: | - python setup.py build_ext -j $N_JOBS - python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index + if [[ ${{ inputs.editable }} == "true" ]]; then + pip install -e . --no-build-isolation -v + else + pip install . 
--no-build-isolation -v + fi shell: bash -el {0} - env: - # https://docs.github.com/en/actions/using-github-hosted-runners/about-github-hosted-runners#supported-runners-and-hardware-resources - N_JOBS: ${{ runner.os == 'macOS' && 3 || 2 }} diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 2a7601f196ec4..fd7c3587f2254 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -7,7 +7,7 @@ runs: shell: bash -el {0} - name: Publish test results - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: Test results path: test-data.xml @@ -19,7 +19,7 @@ runs: if: failure() - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v3 with: flags: unittests name: codecov-pandas diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 329dc24d466b4..84e81b9a9297f 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -3,23 +3,14 @@ inputs: environment-file: description: Conda environment file to use. default: environment.yml - environment-name: - description: Name to use for the Conda environment - default: test - extra-specs: - description: Extra packages to install - required: false runs: using: composite steps: - name: Install ${{ inputs.environment-file }} - uses: mamba-org/provision-with-micromamba@v12 + uses: mamba-org/setup-micromamba@v1 with: environment-file: ${{ inputs.environment-file }} - environment-name: ${{ inputs.environment-name }} - extra-specs: ${{ inputs.extra-specs }} - channels: conda-forge - channel-priority: 'strict' + environment-name: test condarc-file: ci/condarc.yml - cache-env: true + cache-environment: true cache-downloads: true diff --git a/.github/workflows/32-bit-linux.yml b/.github/workflows/32-bit-linux.yml deleted file mode 100644 index 95d0d78c7585b..0000000000000 --- a/.github/workflows/32-bit-linux.yml +++ /dev/null @@ -1,58 +0,0 @@ -name: 32 Bit Linux - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-22.04 - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Run 32-bit manylinux2014 Docker Build / Tests - run: | - # Without this (line 34), versioneer will not be able to determine the pandas version. - # This is because of a security update to git that blocks it from reading the config folder if - # it is not owned by the current user. We hit this since the "mounted" folder is not hit by the - # Docker container. - # xref https://github.com/pypa/manylinux/issues/1309 - docker pull quay.io/pypa/manylinux2014_i686 - docker run --platform linux/386 -v $(pwd):/pandas quay.io/pypa/manylinux2014_i686 \ - /bin/bash -xc "cd pandas && \ - git config --global --add safe.directory /pandas && \ - /opt/python/cp38-cp38/bin/python -m venv ~/virtualenvs/pandas-dev && \ - . ~/virtualenvs/pandas-dev/bin/activate && \ - python -m pip install --no-deps -U pip wheel 'setuptools<60.0.0' && \ - python -m pip install versioneer[toml] && \ - python -m pip install cython numpy python-dateutil pytz pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 && \ - python setup.py build_ext -q -j$(nproc) && \ - python -m pip install --no-build-isolation --no-use-pep517 -e . 
&& \
-                  python -m pip list && \
-                  export PANDAS_CI=1 && \
-                  pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml"
-
-      - name: Publish test results for Python 3.8-32 bit full Linux
-        uses: actions/upload-artifact@v3
-        with:
-          name: Test results
-          path: test-data.xml
-        if: failure()
-    concurrency:
-      # https://github.community/t/concurrecy-not-work-for-push/183068/7
-      group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit
-      cancel-in-progress: true
diff --git a/.github/workflows/cache-cleanup-weekly.yml b/.github/workflows/cache-cleanup-weekly.yml
new file mode 100644
index 0000000000000..225503f2894f8
--- /dev/null
+++ b/.github/workflows/cache-cleanup-weekly.yml
@@ -0,0 +1,29 @@
+name: Purge caches once a week
+on:
+  schedule:
+    # 4:10 UTC on Sunday
+    - cron: "10 4 * * 0"
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clean Cache
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          REPO=${{ github.repository }}
+
+          echo "Fetching list of cache keys"
+          allCaches=$(gh actions-cache list -L 100 -R $REPO | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+          for cacheKey in $allCaches
+          do
+            gh actions-cache delete $cacheKey -R $REPO --confirm
+          done
+          echo "Done"
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/cache-cleanup.yml b/.github/workflows/cache-cleanup.yml
new file mode 100644
index 0000000000000..099974141c1d1
--- /dev/null
+++ b/.github/workflows/cache-cleanup.yml
@@ -0,0 +1,30 @@
+name: Clean closed branch caches
+on:
+  pull_request:
+    types:
+      - closed
+
+jobs:
+  cleanup:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Clean Cache
+        run: |
+          gh extension install actions/gh-actions-cache
+
+          REPO=${{ github.repository }}
+          BRANCH="refs/pull/${{ github.event.pull_request.number }}/merge"
+
+          echo "Fetching list of cache keys"
+          cacheKeysForPR=$(gh actions-cache list -R $REPO -B $BRANCH | cut -f 1 )
+
+          ## Setting this to not fail the workflow while deleting cache keys.
+          set +e
+          echo "Deleting caches..."
+ for cacheKey in $cacheKeysForPR + do + gh actions-cache delete $cacheKey -R $REPO -B $BRANCH --confirm + done + echo "Done" + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index d6d43a8bfc13b..f6c35decfd30b 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -63,8 +63,25 @@ jobs: - name: Build Pandas id: build uses: ./.github/actions/build_pandas + with: + editable: false # The following checks are independent of each other and should still be run if one fails + + # TODO: The doctests have to be run first right now, since the Cython doctests only work + # with pandas installed in non-editable mode + # This can be removed once pytest-cython doesn't require C extensions to be installed inplace + - name: Run doctests + run: cd ci && ./code_checks.sh doctests + if: ${{ steps.build.outcome == 'success' && always() }} + + - name: Install pandas in editable mode + id: build-editable + if: ${{ steps.build.outcome == 'success' && always() }} + uses: ./.github/actions/build_pandas + with: + editable: true + - name: Check for no warnings when building single-page docs run: ci/code_checks.sh single-docs if: ${{ steps.build.outcome == 'success' && always() }} @@ -73,10 +90,6 @@ jobs: run: ci/code_checks.sh code if: ${{ steps.build.outcome == 'success' && always() }} - - name: Run doctests - run: ci/code_checks.sh doctests - if: ${{ steps.build.outcome == 'success' && always() }} - - name: Run docstring validation run: ci/code_checks.sh docstrings if: ${{ steps.build.outcome == 'success' && always() }} @@ -179,7 +192,7 @@ jobs: id: setup_python uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.10' cache: 'pip' cache-dependency-path: 'requirements-dev.txt' diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 23609f692df7c..8715c5306a3b0 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -18,6 +18,7 @@ jobs: actions: read contents: read security-events: write + if: github.repository_owner == 'pandas-dev' strategy: fail-fast: false diff --git a/.github/workflows/deprecation-tracking-bot.yml b/.github/workflows/deprecation-tracking-bot.yml index c0d871ed54ed6..b3f9bcd840c68 100644 --- a/.github/workflows/deprecation-tracking-bot.yml +++ b/.github/workflows/deprecation-tracking-bot.yml @@ -1,11 +1,13 @@ +# This bot updates the issue with number DEPRECATION_TRACKER_ISSUE +# with the PR number that issued the deprecation. 
+ +# It runs on commits to main, and will trigger if the PR linked to a merged commit has the "Deprecate" label name: Deprecations Bot on: - pull_request: + push: branches: - main - types: - [closed] permissions: @@ -15,17 +17,49 @@ jobs: deprecation_update: permissions: issues: write - if: >- - contains(github.event.pull_request.labels.*.name, 'Deprecate') && github.event.pull_request.merged == true runs-on: ubuntu-22.04 env: DEPRECATION_TRACKER_ISSUE: 50578 steps: - - name: Checkout - run: | - echo "Adding deprecation PR number to deprecation tracking issue" - export PR=${{ github.event.pull_request.number }} - BODY=$(curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" https://api.github.com/repos/${{ github.repository }}/issues/${DEPRECATION_TRACKER_ISSUE} | - python3 -c "import sys, json, os; x = {'body': json.load(sys.stdin)['body']}; pr = os.environ['PR']; x['body'] += f'\n- [ ] #{pr}'; print(json.dumps(x))") - echo ${BODY} - curl -H "Authorization: token ${{ secrets.GITHUB_TOKEN }}" -X PATCH -d "${BODY}" https://api.github.com/repos/${{ github.repository }}/issues/${DEPRECATION_TRACKER_ISSUE} + - uses: actions/github-script@v6 + id: update-deprecation-issue + with: + script: | + body = await github.rest.issues.get({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }}, + }) + body = body["data"]["body"]; + linkedPRs = await github.rest.repos.listPullRequestsAssociatedWithCommit({ + owner: context.repo.owner, + repo: context.repo.repo, + commit_sha: '${{ github.sha }}' + }) + linkedPRs = linkedPRs["data"]; + console.log(linkedPRs); + if (linkedPRs.length > 0) { + console.log("Found linked PR"); + linkedPR = linkedPRs[0] + isDeprecation = false + for (label of linkedPR["labels"]) { + if (label["name"] == "Deprecate") { + isDeprecation = true; + break; + } + } + + PR_NUMBER = linkedPR["number"]; + + body += ("\n- [ ] #" + PR_NUMBER); + if (isDeprecation) { + console.log("PR is a deprecation PR. 
Printing new body of issue"); + console.log(body); + github.rest.issues.update({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: ${{ env.DEPRECATION_TRACKER_ISSUE }}, + body: body + }) + } + } diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml deleted file mode 100644 index 7ed5f5b90b959..0000000000000 --- a/.github/workflows/macos-windows.yml +++ /dev/null @@ -1,60 +0,0 @@ -name: Windows-macOS - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - - "web/**" - -env: - PANDAS_CI: 1 - PYTEST_TARGET: pandas - PATTERN: "not slow and not db and not network and not single_cpu" - -permissions: - contents: read - -jobs: - pytest: - defaults: - run: - shell: bash -el {0} - timeout-minutes: 180 - strategy: - matrix: - os: [macos-latest, windows-latest] - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] - fail-fast: false - runs-on: ${{ matrix.os }} - name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} - cancel-in-progress: true - env: - # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors - PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ci/deps/${{ matrix.env_file }} - - - name: Build Pandas - uses: ./.github/actions/build_pandas - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/package-checks.yml b/.github/workflows/package-checks.yml index fa1b5e5d4fba3..6ff3d3b0a3b98 100644 --- a/.github/workflows/package-checks.yml +++ b/.github/workflows/package-checks.yml @@ -14,6 +14,10 @@ on: permissions: contents: read +defaults: + run: + shell: bash -el {0} + jobs: pip: if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} @@ -38,15 +42,40 @@ jobs: id: setup_python uses: actions/setup-python@v4 with: - python-version: '3.8' - - - name: Install required dependencies - run: | - python -m pip install --upgrade pip setuptools wheel python-dateutil pytz numpy cython - python -m pip install versioneer[toml] - shell: bash -el {0} + python-version: '3.10' - name: Pip install with extra run: | - python -m pip install -e .[${{ matrix.extra }}] --no-build-isolation + python -m pip install .[${{ matrix.extra }}] -v shell: bash -el {0} + conda_forge_recipe: + if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} + runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.9', '3.10', '3.11'] + fail-fast: false + name: Test Conda Forge Recipe - Python ${{ matrix.python-version }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-conda-forge-recipe-${{ matrix.python-version }} + cancel-in-progress: true + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: mamba-org/setup-micromamba@v1 + with: + environment-name: recipe-test + create-args: >- + python=${{ 
matrix.python-version }} + boa + conda-verify + cache-downloads: true + cache-environment: true + + - name: Build conda package + run: conda mambabuild ci --no-anaconda-upload --verify --strict-verify --output --output-folder . diff --git a/.github/workflows/python-dev.yml b/.github/workflows/python-dev.yml deleted file mode 100644 index 8ac8a1a1fcebf..0000000000000 --- a/.github/workflows/python-dev.yml +++ /dev/null @@ -1,95 +0,0 @@ -# This workflow may or may not run depending on the state of the next -# unreleased Python version. DO NOT DELETE IT. -# -# In general, this file will remain frozen(present, but not running) until: -# - The next unreleased Python version has released beta 1 -# - This version should be available on GitHub Actions. -# - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) -# support that unreleased Python version. -# To unfreeze, comment out the ``if: false`` condition, and make sure you update -# the name of the workflow and Python version in actions/setup-python to: '3.12-dev' -# -# After it has been unfrozen, this file should remain unfrozen(present, and running) until: -# - The next Python version has been officially released. -# OR -# - Most/All of our optional dependencies support Python 3.11 AND -# - The next Python version has released a rc(we are guaranteed a stable ABI). -# To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs -# to the corresponding posix/windows-macos/sdist etc. workflows. -# Feel free to modify this comment as necessary. - -name: Python Dev - -on: - push: - branches: - - main - - 2.0.x - - None - pull_request: - branches: - - main - - 2.0.x - - None - paths-ignore: - - "doc/**" - - "web/**" - -env: - PYTEST_WORKERS: "auto" - PANDAS_CI: 1 - PATTERN: "not slow and not network and not clipboard and not single_cpu" - COVERAGE: true - PYTEST_TARGET: pandas - -permissions: - contents: read - -jobs: - build: - if: false # Uncomment this to freeze the workflow, comment it to unfreeze - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: [ubuntu-22.04, macOS-latest, windows-latest] - - name: actions-311-dev - timeout-minutes: 120 - - concurrency: - #https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python Dev Version - uses: actions/setup-python@v4 - with: - python-version: '3.11-dev' - - - name: Install dependencies - run: | - python --version - python -m pip install --upgrade pip setuptools wheel - python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy - python -m pip install git+https://github.com/nedbat/coveragepy.git - python -m pip install versioneer[toml] - python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 - python -m pip list - - - name: Build Pandas - run: | - python setup.py build_ext -q -j4 - python -m pip install -e . 
--no-build-isolation --no-use-pep517 --no-index - - - name: Build Version - run: | - python -c "import pandas; pandas.show_versions();" - - - name: Test - uses: ./.github/actions/run-tests diff --git a/.github/workflows/sdist.yml b/.github/workflows/sdist.yml deleted file mode 100644 index 460369f45e900..0000000000000 --- a/.github/workflows/sdist.yml +++ /dev/null @@ -1,96 +0,0 @@ -name: sdist - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - types: [labeled, opened, synchronize, reopened] - paths-ignore: - - "doc/**" - - "web/**" - -permissions: - contents: read - -jobs: - build: - if: ${{ github.event.label.name == 'Build' || contains(github.event.pull_request.labels.*.name, 'Build') || github.event_name == 'push'}} - runs-on: ubuntu-22.04 - timeout-minutes: 60 - defaults: - run: - shell: bash -el {0} - - strategy: - fail-fast: false - matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{matrix.python-version}}-sdist - cancel-in-progress: true - - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: ${{ matrix.python-version }} - - - name: Install dependencies - run: | - python -m pip install --upgrade pip setuptools wheel - python -m pip install versioneer[toml] - - # GH 39416 - pip install numpy - - - name: Build pandas sdist - run: | - pip list - python setup.py sdist --formats=gztar - - - name: Upload sdist artifact - uses: actions/upload-artifact@v3 - with: - name: ${{matrix.python-version}}-sdist.gz - path: dist/*.gz - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: false - environment-name: pandas-sdist - extra-specs: | - python =${{ matrix.python-version }} - - - name: Install pandas from sdist - run: | - pip list - python -m pip install dist/*.gz - - - name: Force oldest supported NumPy - run: | - case "${{matrix.python-version}}" in - 3.8) - pip install numpy==1.21.6 ;; - 3.9) - pip install numpy==1.21.6 ;; - 3.10) - pip install numpy==1.21.6 ;; - 3.11) - pip install numpy==1.23.2 ;; - esac - - - name: Import pandas - run: | - cd .. 
- python -c "import pandas; pandas.show_versions();" diff --git a/.github/workflows/stale-pr.yml b/.github/workflows/stale-pr.yml index 8ead259f95ed3..11b81d11f7876 100644 --- a/.github/workflows/stale-pr.yml +++ b/.github/workflows/stale-pr.yml @@ -11,6 +11,7 @@ jobs: stale: permissions: pull-requests: write + if: github.repository_owner == 'pandas-dev' runs-on: ubuntu-22.04 steps: - uses: actions/stale@v8 diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml deleted file mode 100644 index 7390c349ff565..0000000000000 --- a/.github/workflows/ubuntu.yml +++ /dev/null @@ -1,168 +0,0 @@ -name: Ubuntu - -on: - push: - branches: - - main - - 2.0.x - pull_request: - branches: - - main - - 2.0.x - paths-ignore: - - "doc/**" - - "web/**" - -permissions: - contents: read - -jobs: - pytest: - runs-on: ubuntu-22.04 - defaults: - run: - shell: bash -el {0} - timeout-minutes: 180 - strategy: - matrix: - env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] - # Prevent the include jobs from overriding other jobs - pattern: [""] - include: - - name: "Downstream Compat" - env_file: actions-38-downstream_compat.yaml - pattern: "not slow and not network and not single_cpu" - pytest_target: "pandas/tests/test_downstream.py" - - name: "Minimum Versions" - env_file: actions-38-minimum_versions.yaml - pattern: "not slow and not network and not single_cpu" - - name: "Locale: it_IT" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-it" - # Use the utf8 version as the default, it has no bad side-effect. - lang: "it_IT.utf8" - lc_all: "it_IT.utf8" - # Also install it_IT (its encoding is ISO8859-1) but do not activate it. - # It will be temporarily activated during tests with locale.setlocale - extra_loc: "it_IT" - - name: "Locale: zh_CN" - env_file: actions-38.yaml - pattern: "not slow and not network and not single_cpu" - extra_apt: "language-pack-zh-hans" - # Use the utf8 version as the default, it has no bad side-effect. - lang: "zh_CN.utf8" - lc_all: "zh_CN.utf8" - # Also install zh_CN (its encoding is gb2312) but do not activate it. 
- # It will be temporarily activated during tests with locale.setlocale - extra_loc: "zh_CN" - - name: "Copy-on-Write" - env_file: actions-310.yaml - pattern: "not slow and not network and not single_cpu" - pandas_copy_on_write: "1" - - name: "Pypy" - env_file: actions-pypy-38.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "--max-worker-restart 0" - - name: "Numpy Dev" - env_file: actions-310-numpydev.yaml - pattern: "not slow and not network and not single_cpu" - test_args: "-W error::DeprecationWarning -W error::FutureWarning" - # TODO(cython3): Re-enable once next-beta(after beta 1) comes out - # There are some warnings failing the build with -werror - pandas_ci: "0" - - name: "Pyarrow Nightly" - env_file: actions-311-pyarrownightly.yaml - pattern: "not slow and not network and not single_cpu" - fail-fast: false - name: ${{ matrix.name || matrix.env_file }} - env: - ENV_FILE: ci/deps/${{ matrix.env_file }} - PATTERN: ${{ matrix.pattern }} - EXTRA_APT: ${{ matrix.extra_apt || '' }} - LANG: ${{ matrix.lang || '' }} - LC_ALL: ${{ matrix.lc_all || '' }} - PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} - PANDAS_CI: ${{ matrix.pandas_ci || '1' }} - TEST_ARGS: ${{ matrix.test_args || '' }} - PYTEST_WORKERS: 'auto' - PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} - concurrency: - # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} - cancel-in-progress: true - - services: - mysql: - image: mysql - env: - MYSQL_ALLOW_EMPTY_PASSWORD: yes - MYSQL_DATABASE: pandas - options: >- - --health-cmd "mysqladmin ping" - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 3306:3306 - - postgres: - image: postgres - env: - POSTGRES_USER: postgres - POSTGRES_PASSWORD: postgres - POSTGRES_DB: pandas - options: >- - --health-cmd pg_isready - --health-interval 10s - --health-timeout 5s - --health-retries 5 - ports: - - 5432:5432 - - moto: - image: motoserver/moto:4.1.4 - env: - AWS_ACCESS_KEY_ID: foobar_key - AWS_SECRET_ACCESS_KEY: foobar_secret - ports: - - 5000:5000 - - steps: - - name: Checkout - uses: actions/checkout@v3 - with: - fetch-depth: 0 - - - name: Extra installs - # xsel for clipboard tests - run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} - - - name: Generate extra locales - # These extra locales will be available for locale.setlocale() calls in tests - run: | - sudo locale-gen ${{ matrix.extra_loc }} - if: ${{ matrix.extra_loc }} - - - name: Set up Conda - uses: ./.github/actions/setup-conda - with: - environment-file: ${{ env.ENV_FILE }} - - - name: Build Pandas - id: build - uses: ./.github/actions/build_pandas - - - name: Test (not single_cpu) - uses: ./.github/actions/run-tests - if: ${{ matrix.name != 'Pypy' }} - env: - # Set pattern to not single_cpu if not already set - PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} - - - name: Test (single_cpu) - uses: ./.github/actions/run-tests - env: - PATTERN: 'single_cpu' - PYTEST_WORKERS: 1 - if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml new file mode 100644 index 0000000000000..600986d3297a9 --- /dev/null +++ b/.github/workflows/unit-tests.yml @@ -0,0 +1,363 @@ +name: Unit Tests + +on: + push: + branches: + - main + - 2.0.x + 
pull_request: + branches: + - main + - 2.0.x + paths-ignore: + - "doc/**" + - "web/**" + +permissions: + contents: read + +defaults: + run: + shell: bash -el {0} + +jobs: + ubuntu: + runs-on: ubuntu-22.04 + timeout-minutes: 180 + strategy: + matrix: + env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] + # Prevent the include jobs from overriding other jobs + pattern: [""] + include: + - name: "Downstream Compat" + env_file: actions-311-downstream_compat.yaml + pattern: "not slow and not network and not single_cpu" + pytest_target: "pandas/tests/test_downstream.py" + - name: "Minimum Versions" + env_file: actions-39-minimum_versions.yaml + pattern: "not slow and not network and not single_cpu" + - name: "Locale: it_IT" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-it" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "it_IT.utf8" + lc_all: "it_IT.utf8" + # Also install it_IT (its encoding is ISO8859-1) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale + extra_loc: "it_IT" + - name: "Locale: zh_CN" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + extra_apt: "language-pack-zh-hans" + # Use the utf8 version as the default, it has no bad side-effect. + lang: "zh_CN.utf8" + lc_all: "zh_CN.utf8" + # Also install zh_CN (its encoding is gb2312) but do not activate it. + # It will be temporarily activated during tests with locale.setlocale + extra_loc: "zh_CN" + - name: "Copy-on-Write 3.9" + env_file: actions-39.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Copy-on-Write 3.10" + env_file: actions-310.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Copy-on-Write 3.11" + env_file: actions-311.yaml + pattern: "not slow and not network and not single_cpu" + pandas_copy_on_write: "1" + - name: "Pypy" + env_file: actions-pypy-39.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "--max-worker-restart 0" + - name: "Numpy Dev" + env_file: actions-311-numpydev.yaml + pattern: "not slow and not network and not single_cpu" + test_args: "-W error::DeprecationWarning -W error::FutureWarning" + # TODO(cython3): Re-enable once next-beta(after beta 1) comes out + # There are some warnings failing the build with -werror + pandas_ci: "0" + - name: "Pyarrow Nightly" + env_file: actions-311-pyarrownightly.yaml + pattern: "not slow and not network and not single_cpu" + fail-fast: false + name: ${{ matrix.name || format('ubuntu-latest {0}', matrix.env_file) }} + env: + ENV_FILE: ci/deps/${{ matrix.env_file }} + PATTERN: ${{ matrix.pattern }} + EXTRA_APT: ${{ matrix.extra_apt || '' }} + LANG: ${{ matrix.lang || 'C.UTF-8' }} + LC_ALL: ${{ matrix.lc_all || '' }} + PANDAS_COPY_ON_WRITE: ${{ matrix.pandas_copy_on_write || '0' }} + PANDAS_CI: ${{ matrix.pandas_ci || '1' }} + TEST_ARGS: ${{ matrix.test_args || '' }} + PYTEST_WORKERS: 'auto' + PYTEST_TARGET: ${{ matrix.pytest_target || 'pandas' }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }} + cancel-in-progress: true + + services: + mysql: + image: mysql + env: + MYSQL_ALLOW_EMPTY_PASSWORD: yes + MYSQL_DATABASE: pandas + options: >- + --health-cmd "mysqladmin ping" + 
--health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 3306:3306 + + postgres: + image: postgres + env: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: pandas + options: >- + --health-cmd pg_isready + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 5432:5432 + + moto: + image: motoserver/moto:4.1.12 + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + ports: + - 5000:5000 + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Extra installs + # xsel for clipboard tests + run: sudo apt-get update && sudo apt-get install -y xsel ${{ env.EXTRA_APT }} + + - name: Generate extra locales + # These extra locales will be available for locale.setlocale() calls in tests + run: | + sudo locale-gen ${{ matrix.extra_loc }} + if: ${{ matrix.extra_loc }} + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ${{ env.ENV_FILE }} + + - name: Build Pandas + id: build + uses: ./.github/actions/build_pandas + + - name: Test (not single_cpu) + uses: ./.github/actions/run-tests + if: ${{ matrix.name != 'Pypy' }} + env: + # Set pattern to not single_cpu if not already set + PATTERN: ${{ env.PATTERN == '' && 'not single_cpu' || matrix.pattern }} + + - name: Test (single_cpu) + uses: ./.github/actions/run-tests + env: + PATTERN: 'single_cpu' + PYTEST_WORKERS: 1 + if: ${{ matrix.pattern == '' && (always() && steps.build.outcome == 'success')}} + + macos-windows: + timeout-minutes: 180 + strategy: + matrix: + os: [macos-latest, windows-latest] + env_file: [actions-39.yaml, actions-310.yaml, actions-311.yaml] + fail-fast: false + runs-on: ${{ matrix.os }} + name: ${{ format('{0} {1}', matrix.os, matrix.env_file) }} + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.os }} + cancel-in-progress: true + env: + PANDAS_CI: 1 + PYTEST_TARGET: pandas + PATTERN: "not slow and not db and not network and not single_cpu" + # GH 47443: PYTEST_WORKERS > 1 crashes Windows builds with memory related errors + PYTEST_WORKERS: ${{ matrix.os == 'macos-latest' && 'auto' || '1' }} + + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Conda + uses: ./.github/actions/setup-conda + with: + environment-file: ci/deps/${{ matrix.env_file }} + + - name: Build Pandas + uses: ./.github/actions/build_pandas + + - name: Test + uses: ./.github/actions/run-tests + + Linux-32-bit: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/manylinux2014_i686 + options: --platform linux/386 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Build environment and Run Tests + run: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . 
~/virtualenvs/pandas-dev/bin/activate + python -m pip install -U pip wheel setuptools meson[ninja]==1.0.1 meson-python==0.13.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir --no-build-isolation -e . + python -m pip list --no-cache-dir + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-32bit + cancel-in-progress: true + + Linux-Musl: + runs-on: ubuntu-22.04 + container: + image: quay.io/pypa/musllinux_1_1_x86_64 + steps: + - name: Checkout pandas Repo + # actions/checkout does not work since it requires node + run: | + git config --global --add safe.directory $PWD + + if [ $GITHUB_EVENT_NAME != pull_request ]; then + git clone --recursive --branch=$GITHUB_REF_NAME https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git reset --hard $GITHUB_SHA + else + git clone --recursive https://github.com/${GITHUB_REPOSITORY}.git $GITHUB_WORKSPACE + git fetch origin $GITHUB_REF:my_ref_name + git checkout $GITHUB_BASE_REF + git -c user.email="you@example.com" merge --no-commit my_ref_name + fi + - name: Configure System Packages + run: | + apk update + apk add musl-locales + - name: Build environment + run: | + /opt/python/cp311-cp311/bin/python -m venv ~/virtualenvs/pandas-dev + . ~/virtualenvs/pandas-dev/bin/activate + python -m pip install -U pip wheel setuptools meson-python==0.13.1 meson[ninja]==1.0.1 + python -m pip install --no-cache-dir versioneer[toml] cython numpy python-dateutil pytz pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 hypothesis>=6.46.1 + python -m pip install --no-cache-dir --no-build-isolation -e . + python -m pip list --no-cache-dir + + - name: Run Tests + run: | + . ~/virtualenvs/pandas-dev/bin/activate + export PANDAS_CI=1 + python -m pytest -m 'not slow and not network and not clipboard and not single_cpu' pandas --junitxml=test-data.xml + concurrency: + # https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-musl + cancel-in-progress: true + + python-dev: + # This job may or may not run depending on the state of the next + # unreleased Python version. DO NOT DELETE IT. + # + # In general, this will remain frozen(present, but not running) until: + # - The next unreleased Python version has released beta 1 + # - This version should be available on GitHub Actions. + # - Our required build/runtime dependencies(numpy, pytz, Cython, python-dateutil) + # support that unreleased Python version. + # To unfreeze, comment out the ``if: false`` condition, and make sure you update + # the name of the workflow and Python version in actions/setup-python ``python-version:`` + # + # After it has been unfrozen, this file should remain unfrozen(present, and running) until: + # - The next Python version has been officially released. + # OR + # - Most/All of our optional dependencies support the next Python version AND + # - The next Python version has released a rc(we are guaranteed a stable ABI). + # To freeze this file, uncomment out the ``if: false`` condition, and migrate the jobs + # to the corresponding posix/windows-macos/sdist etc. workflows. 
+ # Feel free to modify this comment as necessary. + if: false # Uncomment this to freeze the workflow, comment it to unfreeze + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-22.04, macOS-latest, windows-latest] + + timeout-minutes: 180 + + concurrency: + #https://github.community/t/concurrecy-not-work-for-push/183068/7 + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.os }}-${{ matrix.pytest_target }}-dev + cancel-in-progress: true + + env: + PYTEST_WORKERS: "auto" + PANDAS_CI: 1 + PATTERN: "not slow and not network and not clipboard and not single_cpu" + COVERAGE: true + PYTEST_TARGET: pandas + + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python Dev Version + uses: actions/setup-python@v4 + with: + python-version: '3.11-dev' + + - name: Install dependencies + run: | + python --version + python -m pip install --upgrade pip setuptools wheel + python -m pip install --pre --extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple numpy + python -m pip install git+https://github.com/nedbat/coveragepy.git + python -m pip install versioneer[toml] + python -m pip install python-dateutil pytz cython hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-cov pytest-asyncio>=0.17 + python -m pip list + + - name: Build Pandas + run: | + python -m pip install -e . --no-build-isolation --no-index + + - name: Build Version + run: | + python -c "import pandas; pandas.show_versions();" + + - name: Test + uses: ./.github/actions/run-tests diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 593a98d64e46a..f1f9646054132 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -4,7 +4,7 @@ # In an attempt to save CI resources, wheel builds do # not run on each push but only weekly and for releases. # Wheel builds can be triggered from the Actions page -# (if you have the perms) on a commit to master. +# (if you have the permissions) on a commit to main. # # Alternatively, you can add labels to the pull request in order to trigger wheel # builds. @@ -14,16 +14,14 @@ name: Wheel builder on: schedule: - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - # │ │ │ │ │ - - cron: "27 3 */1 * *" + # 3:27 UTC every day + - cron: "27 3 * * *" push: pull_request: - types: [labeled, opened, synchronize, reopened] + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - "doc/**" + - "web/**" workflow_dispatch: concurrency: @@ -34,104 +32,68 @@ permissions: contents: read jobs: - build_wheels: - name: Build wheel for ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + build_sdist: + name: Build sdist if: >- - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository_owner == 'pandas-dev') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'Build')) || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! 
endsWith(github.ref, 'dev0'))) - runs-on: ${{ matrix.buildplat[0] }} - strategy: - # Ensure that a wheel builder finishes even if another fails - fail-fast: false - matrix: - # GitHub Actions doesn't support pairing matrix values together, let's improvise - # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 - buildplat: - - [ubuntu-20.04, manylinux_x86_64] - - [macos-11, macosx_*] - - [windows-2019, win_amd64] - - [windows-2019, win32] - # TODO: support PyPy? - python: [["cp38", "3.8"], ["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]]# "pp38", "pp39"] + runs-on: ubuntu-22.04 env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} + outputs: + sdist_file: ${{ steps.save-path.outputs.sdist_name }} steps: - name: Checkout pandas uses: actions/checkout@v3 with: - submodules: true - # versioneer.py requires the latest tag to be reachable. Here we - # fetch the complete history to get access to the tags. - # A shallow clone can work when the following issue is resolved: - # https://github.com/actions/checkout/issues/338 fetch-depth: 0 - - name: Build wheels - uses: pypa/cibuildwheel@v2.12.1 - env: - CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} - - # Used to test(Windows-only) and push the built wheels - # You might need to use setup-python separately - # if the new Python-dev version - # is unavailable on conda-forge. - - uses: conda-incubator/setup-miniconda@v2 + - name: Set up Python + uses: actions/setup-python@v4 with: - auto-update-conda: true - python-version: ${{ matrix.python[1] }} - activate-environment: test - channels: conda-forge, anaconda - channel-priority: true - # mamba fails to solve, also we really don't need this since we're just installing python - # mamba-version: "*" - - - name: Test wheels (Windows 64-bit only) - if: ${{ matrix.buildplat[1] == 'win_amd64' }} - shell: cmd /C CALL {0} + python-version: '3.11' + + - name: Build sdist run: | - python ci/test_wheels.py wheelhouse + python -m pip install build + python -m build --sdist - uses: actions/upload-artifact@v3 with: - name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} - path: ./wheelhouse/*.whl - + name: sdist + path: ./dist/* - - name: Install anaconda client - if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} + - name: Output sdist name + id: save-path shell: bash -el {0} - run: conda install -q -y anaconda-client + run: echo "sdist_name=$(ls ./dist)" >> "$GITHUB_OUTPUT" - - - name: Upload wheels - if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} - shell: bash -el {0} - env: - PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }} - PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }} - run: | - source ci/upload_wheels.sh - set_upload_vars - # trigger an upload to - # https://anaconda.org/scipy-wheels-nightly/pandas - # for cron jobs or "Run workflow" (restricted to main branch). 
- # Tags will upload to - # https://anaconda.org/multibuild-wheels-staging/pandas - # The tokens were originally generated at anaconda.org - upload_wheels - build_sdist: - name: Build sdist + build_wheels: + needs: build_sdist + name: Build wheel for ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} if: >- - github.event_name == 'schedule' || + (github.event_name == 'schedule' && github.repository_owner == 'pandas-dev') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && contains(github.event.pull_request.labels.*.name, 'Build')) || (github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && ( ! endsWith(github.ref, 'dev0'))) - runs-on: ubuntu-22.04 + runs-on: ${{ matrix.buildplat[0] }} + strategy: + fail-fast: false + matrix: + # GitHub Actions doesn't support pairing matrix values together, let's improvise + # https://github.com/github/feedback/discussions/7835#discussioncomment-1769026 + buildplat: + - [ubuntu-22.04, manylinux_x86_64] + - [ubuntu-22.04, musllinux_x86_64] + - [macos-12, macosx_*] + - [windows-2022, win_amd64] + # TODO: support PyPy? + python: [["cp39", "3.9"], ["cp310", "3.10"], ["cp311", "3.11"]] env: IS_PUSH: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') }} IS_SCHEDULE_DISPATCH: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} @@ -139,66 +101,70 @@ jobs: - name: Checkout pandas uses: actions/checkout@v3 with: - submodules: true - # versioneer.py requires the latest tag to be reachable. Here we - # fetch the complete history to get access to the tags. - # A shallow clone can work when the following issue is resolved: - # https://github.com/actions/checkout/issues/338 fetch-depth: 0 - # Used to push the built sdist - - uses: conda-incubator/setup-miniconda@v2 + - name: Download sdist + uses: actions/download-artifact@v3 with: - auto-update-conda: true - # Really doesn't matter what version we upload with - # just the version we test with - python-version: '3.8' - channels: conda-forge - channel-priority: true - # mamba fails to solve, also we really don't need this since we're just installing python - # mamba-version: "*" + name: sdist + path: ./dist - - name: Build sdist - run: | - pip install build - python -m build --sdist - - name: Test the sdist + - name: Build wheels + uses: pypa/cibuildwheel@v2.13.1 + # TODO: Build wheels from sdist again + # There's some sort of weird race condition? + # within Github that makes the sdist be missing files + #with: + # package-dir: ./dist/${{ needs.build_sdist.outputs.sdist_file }} + env: + CIBW_BUILD: ${{ matrix.python[0] }}-${{ matrix.buildplat[1] }} + + - name: Set up Python + uses: mamba-org/setup-micromamba@v1 + with: + environment-name: wheel-env + create-args: >- + python=${{ matrix.python[1] }} + anaconda-client + wheel + cache-downloads: true + cache-environment: true + + - name: Validate wheel RECORD shell: bash -el {0} + run: for whl in $(ls wheelhouse); do wheel unpack wheelhouse/$whl -d /tmp; done + + # Testing on windowsservercore instead of GHA runner to fail on missing DLLs + - name: Test Windows Wheels + if: ${{ matrix.buildplat[1] == 'win_amd64' }} + shell: pwsh run: | - # TODO: Don't run test suite, and instead build wheels from sdist - # by splitting the wheel builders into a two stage job - # (1. Generate sdist 2. 
Build wheels from sdist) - # This tests the sdists, and saves some build time - python -m pip install dist/*.gz - pip install hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 - cd .. # Not a good idea to test within the src tree - python -c "import pandas; print(pandas.__version__); - pandas.test(extra_args=['-m not clipboard and not single_cpu and not slow and not network and not db', '-n 2']); - pandas.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db'])" + $TST_CMD = @" + python -m pip install pytz six numpy python-dateutil tzdata>=2022.1 hypothesis>=6.46.1 pytest>=7.3.2 pytest-xdist>=2.2.0 pytest-asyncio>=0.17; + python -m pip install --find-links=pandas\wheelhouse --no-index pandas; + python -c `'import pandas as pd; pd.test()`'; + "@ + docker pull python:${{ matrix.python[1] }}-windowsservercore + docker run --env PANDAS_CI='1' -v ${PWD}:C:\pandas python:${{ matrix.python[1] }}-windowsservercore powershell -Command $TST_CMD + - uses: actions/upload-artifact@v3 with: - name: sdist - path: ./dist/* - - - name: Install anaconda client - if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} - shell: bash -el {0} - run: | - conda install -q -y anaconda-client + name: ${{ matrix.python[0] }}-${{ startsWith(matrix.buildplat[1], 'macosx') && 'macosx' || matrix.buildplat[1] }} + path: ./wheelhouse/*.whl - - name: Upload sdist + - name: Upload wheels & sdist if: ${{ success() && (env.IS_SCHEDULE_DISPATCH == 'true' || env.IS_PUSH == 'true') }} shell: bash -el {0} env: PANDAS_STAGING_UPLOAD_TOKEN: ${{ secrets.PANDAS_STAGING_UPLOAD_TOKEN }} PANDAS_NIGHTLY_UPLOAD_TOKEN: ${{ secrets.PANDAS_NIGHTLY_UPLOAD_TOKEN }} + # trigger an upload to + # https://anaconda.org/scientific-python-nightly-wheels/pandas + # for cron jobs or "Run workflow" (restricted to main branch). + # Tags will upload to + # https://anaconda.org/multibuild-wheels-staging/pandas + # The tokens were originally generated at anaconda.org run: | source ci/upload_wheels.sh set_upload_vars - # trigger an upload to - # https://anaconda.org/scipy-wheels-nightly/pandas - # for cron jobs or "Run workflow" (restricted to main branch). - # Tags will upload to - # https://anaconda.org/multibuild-wheels-staging/pandas - # The tokens were originally generated at anaconda.org upload_wheels diff --git a/.gitignore b/.gitignore index 88ed58b70925d..cd22c2bb8cb5b 100644 --- a/.gitignore +++ b/.gitignore @@ -36,7 +36,10 @@ *.py[ocd] *.so .build_cache_dir +.mesonpy-native-file.ini MANIFEST +compile_commands.json +debug # Python files # ################ @@ -70,11 +73,14 @@ coverage.xml coverage_html_report .mypy_cache *.pytest_cache +.ruff_cache # hypothesis test database .hypothesis/ __pycache__ # pytest-monkeytype monkeytype.sqlite3 +# meson editable install folder +.mesonpy # OS generated files # diff --git a/.gitpod.yml b/.gitpod.yml index 8b086a589a378..0a5b5648994ae 100644 --- a/.gitpod.yml +++ b/.gitpod.yml @@ -15,6 +15,7 @@ tasks: git fetch --tags python setup.py build_ext --inplace -j 4 echo "🛠 Completed rebuilding Pandas!! 🛠 " + pre-commit install echo "✨ Pre-build complete! 
You can close this terminal ✨ " # -------------------------------------------------------- diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 43b3699907325..c9cd7528bcd2f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -15,20 +15,13 @@ default_stages: [ ci: autofix_prs: false repos: -- repo: local +- repo: https://github.com/hauntsaninja/black-pre-commit-mirror + # black compiled with mypyc + rev: 23.3.0 hooks: - # NOTE: we make `black` a local hook because if it's installed from - # PyPI (rather than from source) then it'll run twice as fast thanks to mypyc - - id: black - name: black - description: "Black: The uncompromising Python code formatter" - entry: black - language: python - require_serial: true - types_or: [python, pyi] - additional_dependencies: [black==23.1.0] + - id: black - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.0.259 + rev: v0.0.270 hooks: - id: ruff args: [--exit-non-zero-on-fix] @@ -40,13 +33,13 @@ repos: pass_filenames: true require_serial: false - repo: https://github.com/codespell-project/codespell - rev: v2.2.2 + rev: v2.2.4 hooks: - id: codespell types_or: [python, rst, markdown, cython, c] additional_dependencies: [tomli] - repo: https://github.com/MarcoGorelli/cython-lint - rev: v0.12.5 + rev: v0.15.0 hooks: - id: cython-lint - id: double-quote-cython-strings @@ -65,11 +58,7 @@ repos: rev: 1.6.1 hooks: - id: cpplint - # We don't lint all C files because we don't want to lint any that are built - # from Cython files nor do we want to lint C files that we didn't modify for - # this particular codebase (e.g. src/headers, src/klib). However, - # we can lint all header files since they aren't "generated" like C files are. - exclude: ^pandas/_libs/src/(klib|headers)/ + exclude: ^pandas/_libs/include/pandas/vendored/klib args: [ --quiet, '--extensions=c,h', @@ -78,14 +67,12 @@ repos: --linelength=88, '--filter=-readability/casting,-runtime/int,-build/include_subdir,-readability/fn_size' ] -- repo: https://github.com/pycqa/pylint - rev: v2.16.2 +- repo: https://github.com/pylint-dev/pylint + rev: v3.0.0a6 hooks: - id: pylint stages: [manual] -- repo: https://github.com/pycqa/pylint - rev: v2.16.2 - hooks: + args: [--load-plugins=pylint.extensions.redefined_loop_name] - id: pylint alias: redefined-outer-name name: Redefining name from outer scope @@ -104,7 +91,7 @@ repos: hooks: - id: isort - repo: https://github.com/asottile/pyupgrade - rev: v3.3.1 + rev: v3.4.0 hooks: - id: pyupgrade args: [--py38-plus] @@ -340,6 +327,11 @@ repos: language: python entry: python scripts/validate_unwanted_patterns.py --validation-type="strings_with_wrong_placed_whitespace" types_or: [python, cython] + - id: unwanted-patterns-nodefault-used-not-only-for-typing + name: Check that `pandas._libs.lib.NoDefault` is used only for typing + language: python + entry: python scripts/validate_unwanted_patterns.py --validation-type="nodefault_used_not_only_for_typing" + types: [python] - id: use-pd_array-in-core name: Import pandas.array as pd_array in core language: python diff --git a/LICENSES/OTHER b/LICENSES/OTHER index 7446d68eb43a6..e156152990cc4 100644 --- a/LICENSES/OTHER +++ b/LICENSES/OTHER @@ -26,24 +26,6 @@ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-google-api-python-client license --------------------------------- - -Copyright (C) 2012 Google Inc. -All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. - Pyperclip v1.3 license ---------------------- diff --git a/MANIFEST.in b/MANIFEST.in index 361cd8ff9ec22..9894381ed6252 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,6 +1,3 @@ -include RELEASE.md -include versioneer.py - graft doc prune doc/build @@ -10,6 +7,7 @@ graft pandas global-exclude *.bz2 global-exclude *.csv +global-exclude *.data global-exclude *.dta global-exclude *.feather global-exclude *.tar @@ -18,9 +16,12 @@ global-exclude *.h5 global-exclude *.html global-exclude *.json global-exclude *.jsonl +global-exclude *.kml global-exclude *.msgpack global-exclude *.pdf +global-exclude *.parquet global-exclude *.pickle +global-exclude *.pkl global-exclude *.png global-exclude *.pptx global-exclude *.ods @@ -29,12 +30,15 @@ global-exclude *.orc global-exclude *.sas7bdat global-exclude *.sav global-exclude *.so +global-exclude *.txt global-exclude *.xls global-exclude *.xlsb global-exclude *.xlsm global-exclude *.xlsx global-exclude *.xpt global-exclude *.cpt +global-exclude *.xml +global-exclude *.xsl global-exclude *.xz global-exclude *.zip global-exclude *.zst @@ -57,6 +61,4 @@ prune pandas/tests/io/parser/data # Selectively re-add *.cxx files that were excluded above graft pandas/_libs/src -graft pandas/_libs/tslibs/src -include pandas/_libs/pd_parser.h -include pandas/_libs/pd_parser.c +graft pandas/_libs/include diff --git a/README.md b/README.md index 9f2bc800e8479..1bff2941f86ca 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,19 @@ the broader goal of becoming **the most powerful and flexible open source data analysis / manipulation tool available in any language**. It is already well on its way towards this goal. +## Table of Contents + +- [Main Features](#main-features) +- [Where to get it](#where-to-get-it) +- [Dependencies](#dependencies) +- [Installation from sources](#installation-from-sources) +- [License](#license) +- [Documentation](#documentation) +- [Background](#background) +- [Getting Help](#getting-help) +- [Discussion and Development](#discussion-and-development) +- [Contributing to pandas](#contributing-to-pandas) + ## Main Features Here are just a few of the things that pandas does well: @@ -155,9 +168,17 @@ For usage questions, the best place to go to is [StackOverflow](https://stackove Further, general questions and discussions can also take place on the [pydata mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata). ## Discussion and Development -Most development discussions take place on GitHub in this repo. Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Slack channel](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) is available for quick development related questions. 
+Most development discussions take place on GitHub in this repo, via the [GitHub issue tracker](https://github.com/pandas-dev/pandas/issues). + +Further, the [pandas-dev mailing list](https://mail.python.org/mailman/listinfo/pandas-dev) can also be used for specialized discussions or design issues, and a [Slack channel](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack) is available for quick development related questions. + +There are also frequent [community meetings](https://pandas.pydata.org/docs/dev/development/community.html#community-meeting) for project maintainers open to the community as well as monthly [new contributor meetings](https://pandas.pydata.org/docs/dev/development/community.html#new-contributor-meeting) to help support new contributors. + +Additional information on the communication channels can be found on the [contributor community](https://pandas.pydata.org/docs/development/community.html) page. -## Contributing to pandas [![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) +## Contributing to pandas + +[![Open Source Helpers](https://www.codetriage.com/pandas-dev/pandas/badges/users.svg)](https://www.codetriage.com/pandas-dev/pandas) All contributions, bug reports, bug fixes, documentation improvements, enhancements, and ideas are welcome. @@ -172,3 +193,7 @@ Or maybe through using pandas you have an idea of your own or are looking for so Feel free to ask questions on the [mailing list](https://groups.google.com/forum/?fromgroups#!forum/pydata) or on [Slack](https://pandas.pydata.org/docs/dev/development/community.html?highlight=slack#community-slack). As contributors and maintainers to this project, you are expected to abide by pandas' code of conduct. More information can be found at: [Contributor Code of Conduct](https://github.com/pandas-dev/.github/blob/master/CODE_OF_CONDUCT.md) + +
+ +[Go to Top](#table-of-contents) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index c503ae5e17471..810764754b7e1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -29,7 +29,7 @@ // The Pythons you'd like to test against. If not provided, defaults // to the current version of Python used to run `asv`. - "pythons": ["3.8"], + "pythons": ["3.10"], // The matrix of dependencies to test. Each key is the name of a // package (in PyPI) and the values are version numbers. An empty @@ -41,7 +41,6 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { - "numpy": [], "Cython": ["0.29.33"], "matplotlib": [], "sqlalchemy": [], @@ -56,6 +55,9 @@ "xlrd": [], "odfpy": [], "jinja2": [], + "meson": [], + "meson-python": [], + "python-build": [], }, "conda_channels": ["conda-forge"], // Combinations of libraries/python versions can be excluded/included @@ -125,7 +127,5 @@ "regression_thresholds": { }, "build_command": - ["python -m pip install versioneer[toml]", - "python setup.py build -j4", - "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}"], + ["python -m build -Cbuilddir=builddir --wheel --outdir {build_cache_dir} {build_dir}"] } diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index 52c87455b12b3..c33043c0eddc1 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -24,7 +24,7 @@ class Dtypes: - params = _dtypes + list(map(lambda dt: dt.name, _dtypes)) + params = _dtypes + [dt.name for dt in _dtypes] param_names = ["dtype"] def time_pandas_dtype(self, dtype): diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 4c0f3ddd826b7..6617b3c8b4cca 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -57,6 +57,38 @@ }, } +# These aggregations don't have a kernel implemented for them yet +_numba_unsupported_methods = [ + "all", + "any", + "bfill", + "count", + "cumcount", + "cummax", + "cummin", + "cumprod", + "cumsum", + "describe", + "diff", + "ffill", + "first", + "head", + "last", + "median", + "nunique", + "pct_change", + "prod", + "quantile", + "rank", + "sem", + "shift", + "size", + "skew", + "tail", + "unique", + "value_counts", +] + class ApplyDictReturn: def setup(self): @@ -453,9 +485,10 @@ class GroupByMethods: ], ["direct", "transformation"], [1, 5], + ["cython", "numba"], ] - def setup(self, dtype, method, application, ncols): + def setup(self, dtype, method, application, ncols, engine): if method in method_blocklist.get(dtype, {}): raise NotImplementedError # skip benchmark @@ -474,6 +507,19 @@ def setup(self, dtype, method, application, ncols): # DataFrameGroupBy doesn't have these methods raise NotImplementedError + # Numba currently doesn't support + # multiple transform functions or strs for transform, + # grouping on multiple columns + # and we lack kernels for a bunch of methods + if ( + engine == "numba" + and method in _numba_unsupported_methods + or ncols > 1 + or application == "transformation" + or dtype == "datetime" + ): + raise NotImplementedError + if method == "describe": ngroups = 20 elif method == "skew": @@ -505,17 +551,30 @@ def setup(self, dtype, method, application, ncols): if len(cols) == 1: cols = cols[0] + # Not everything supports the engine keyword yet + kwargs = {} + if engine == "numba": + kwargs["engine"] = engine + if application == "transformation": - self.as_group_method = lambda: 
df.groupby("key")[cols].transform(method) - self.as_field_method = lambda: df.groupby(cols)["key"].transform(method) + self.as_group_method = lambda: df.groupby("key")[cols].transform( + method, **kwargs + ) + self.as_field_method = lambda: df.groupby(cols)["key"].transform( + method, **kwargs + ) else: - self.as_group_method = getattr(df.groupby("key")[cols], method) - self.as_field_method = getattr(df.groupby(cols)["key"], method) + self.as_group_method = partial( + getattr(df.groupby("key")[cols], method), **kwargs + ) + self.as_field_method = partial( + getattr(df.groupby(cols)["key"], method), **kwargs + ) - def time_dtype_as_group(self, dtype, method, application, ncols): + def time_dtype_as_group(self, dtype, method, application, ncols, engine): self.as_group_method() - def time_dtype_as_field(self, dtype, method, application, ncols): + def time_dtype_as_field(self, dtype, method, application, ncols, engine): self.as_field_method() @@ -532,8 +591,12 @@ class GroupByCythonAgg: [ "sum", "prod", - "min", - "max", + # TODO: uncomment min/max + # Currently, min/max implemented very inefficiently + # because it re-uses the Window min/max kernel + # so it will time out ASVs + # "min", + # "max", "mean", "median", "var", @@ -554,6 +617,22 @@ def time_frame_agg(self, dtype, method): self.df.groupby("key").agg(method) +class GroupByNumbaAgg(GroupByCythonAgg): + """ + Benchmarks specifically targeting our numba aggregation algorithms + (using a big enough dataframe with simple key, so a large part of the + time is actually spent in the grouped aggregation). + """ + + def setup(self, dtype, method): + if method in _numba_unsupported_methods: + raise NotImplementedError + super().setup(dtype, method) + + def time_frame_agg(self, dtype, method): + self.df.groupby("key").agg(method, engine="numba") + + class GroupByCythonAggEaDtypes: """ Benchmarks specifically targeting our cython aggregation algorithms diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 53827cfcf64fb..84d95a23bd446 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -3,6 +3,7 @@ lower-level methods directly on Index and subclasses, see index_object.py, indexing_engine.py, and index_cached.py """ +from datetime import datetime import warnings import numpy as np @@ -531,4 +532,25 @@ def time_chained_indexing(self, mode): df2["C"] = 1.0 +class Block: + params = [ + (True, "True"), + (np.array(True), "np.array(True)"), + ] + + def setup(self, true_value, mode): + self.df = DataFrame( + False, + columns=np.arange(500).astype(str), + index=date_range("2010-01-01", "2011-01-01"), + ) + + self.true_value = true_value + + def time_test(self, true_value, mode): + start = datetime(2010, 5, 1) + end = datetime(2010, 9, 1) + self.df.loc[start:end, :] = true_value + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/io/csv.py b/asv_bench/benchmarks/io/csv.py index 36301d22db5d3..86a983d3deb62 100644 --- a/asv_bench/benchmarks/io/csv.py +++ b/asv_bench/benchmarks/io/csv.py @@ -12,6 +12,7 @@ DataFrame, concat, date_range, + period_range, read_csv, to_datetime, ) @@ -98,24 +99,76 @@ def time_frame_date_no_format_index(self): self.data.to_csv(self.fname) +class ToCSVPeriod(BaseIO): + fname = "__test__.csv" + + params = ([1000, 10000], ["D", "H"]) + param_names = ["nobs", "freq"] + + def setup(self, nobs, freq): + rng = period_range(start="2000-01-01", periods=nobs, freq=freq) + self.data = DataFrame(rng) + if freq == "D": + 
self.default_fmt = "%Y-%m-%d" + elif freq == "H": + self.default_fmt = "%Y-%m-%d %H:00" + + def time_frame_period_formatting_default(self, nobs, freq): + self.data.to_csv(self.fname) + + def time_frame_period_formatting_default_explicit(self, nobs, freq): + self.data.to_csv(self.fname, date_format=self.default_fmt) + + def time_frame_period_formatting(self, nobs, freq): + # Nb: `date_format` is not actually taken into account here today, so the + # performance is currently identical to `time_frame_period_formatting_default` + # above. This timer is therefore expected to degrade when GH#51621 is fixed. + # (Remove this comment when GH#51621 is fixed.) + self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") + + +class ToCSVPeriodIndex(BaseIO): + fname = "__test__.csv" + + params = ([1000, 10000], ["D", "H"]) + param_names = ["nobs", "freq"] + + def setup(self, nobs, freq): + rng = period_range(start="2000-01-01", periods=nobs, freq=freq) + self.data = DataFrame({"a": 1}, index=rng) + if freq == "D": + self.default_fmt = "%Y-%m-%d" + elif freq == "H": + self.default_fmt = "%Y-%m-%d %H:00" + + def time_frame_period_formatting_index(self, nobs, freq): + self.data.to_csv(self.fname, date_format="%Y-%m-%d___%H:%M:%S") + + def time_frame_period_formatting_index_default(self, nobs, freq): + self.data.to_csv(self.fname) + + def time_frame_period_formatting_index_default_explicit(self, nobs, freq): + self.data.to_csv(self.fname, date_format=self.default_fmt) + + class ToCSVDatetimeBig(BaseIO): fname = "__test__.csv" timeout = 1500 params = [1000, 10000, 100000] - param_names = ["obs"] + param_names = ["nobs"] - def setup(self, obs): + def setup(self, nobs): d = "2018-11-29" dt = "2018-11-26 11:18:27.0" self.data = DataFrame( { - "dt": [np.datetime64(dt)] * obs, - "d": [np.datetime64(d)] * obs, - "r": [np.random.uniform()] * obs, + "dt": [np.datetime64(dt)] * nobs, + "d": [np.datetime64(d)] * nobs, + "r": [np.random.uniform()] * nobs, } ) - def time_frame(self, obs): + def time_frame(self, nobs): self.data.to_csv(self.fname) @@ -444,7 +497,7 @@ class ReadCSVMemoryGrowth(BaseIO): param_names = ["engine"] def setup(self, engine): - with open(self.fname, "w") as f: + with open(self.fname, "w", encoding="utf-8") as f: for i in range(self.num_rows): f.write(f"{i}\n") diff --git a/asv_bench/benchmarks/join_merge.py b/asv_bench/benchmarks/join_merge.py index cd6d091334ae2..ebdf91bf35455 100644 --- a/asv_bench/benchmarks/join_merge.py +++ b/asv_bench/benchmarks/join_merge.py @@ -324,6 +324,38 @@ def time_i8merge(self, how): merge(self.left, self.right, how=how) +class MergeDatetime: + params = [ + [ + ("ns", "ns"), + ("ms", "ms"), + ("ns", "ms"), + ], + [None, "Europe/Brussels"], + ] + param_names = ["units", "tz"] + + def setup(self, units, tz): + unit_left, unit_right = units + N = 10_000 + keys = Series(date_range("2012-01-01", freq="T", periods=N, tz=tz)) + self.left = DataFrame( + { + "key": keys.sample(N * 10, replace=True).dt.as_unit(unit_left), + "value1": np.random.randn(N * 10), + } + ) + self.right = DataFrame( + { + "key": keys[:8000].dt.as_unit(unit_right), + "value2": np.random.randn(8000), + } + ) + + def time_merge(self, units, tz): + merge(self.left, self.right) + + class MergeCategoricals: def setup(self): self.left_object = DataFrame( diff --git a/asv_bench/benchmarks/multiindex_object.py b/asv_bench/benchmarks/multiindex_object.py index 9c997b5386eaa..87dcdb16fa647 100644 --- a/asv_bench/benchmarks/multiindex_object.py +++ b/asv_bench/benchmarks/multiindex_object.py @@ -396,4 
+396,30 @@ def time_putmask_all_different(self): self.midx.putmask(self.mask, self.midx_values_different) +class Append: + params = ["datetime64[ns]", "int64", "string"] + param_names = ["dtype"] + + def setup(self, dtype): + N1 = 1000 + N2 = 500 + left_level1 = range(N1) + right_level1 = range(N1, N1 + N1) + + if dtype == "datetime64[ns]": + level2 = date_range(start="2000-01-01", periods=N2) + elif dtype == "int64": + level2 = range(N2) + elif dtype == "string": + level2 = tm.makeStringIndex(N2) + else: + raise NotImplementedError + + self.left = MultiIndex.from_product([left_level1, level2]) + self.right = MultiIndex.from_product([right_level1, level2]) + + def time_append(self, dtype): + self.left.append(self.right) + + from .pandas_vb_common import setup # noqa: F401 isort:skip diff --git a/asv_bench/benchmarks/pandas_vb_common.py b/asv_bench/benchmarks/pandas_vb_common.py index 97d91111e833a..4bd56ccb1b5ce 100644 --- a/asv_bench/benchmarks/pandas_vb_common.py +++ b/asv_bench/benchmarks/pandas_vb_common.py @@ -17,7 +17,7 @@ try: import pandas._testing as tm except ImportError: - import pandas.util.testing as tm # noqa:F401 + import pandas.util.testing as tm # noqa: F401 numeric_dtypes = [ diff --git a/asv_bench/benchmarks/strftime.py b/asv_bench/benchmarks/strftime.py index ac1b7f65d2d90..39cc82e1bdf79 100644 --- a/asv_bench/benchmarks/strftime.py +++ b/asv_bench/benchmarks/strftime.py @@ -7,58 +7,109 @@ class DatetimeStrftime: timeout = 1500 params = [1000, 10000] - param_names = ["obs"] + param_names = ["nobs"] - def setup(self, obs): + def setup(self, nobs): d = "2018-11-29" dt = "2018-11-26 11:18:27.0" self.data = pd.DataFrame( { - "dt": [np.datetime64(dt)] * obs, - "d": [np.datetime64(d)] * obs, - "r": [np.random.uniform()] * obs, + "dt": [np.datetime64(dt)] * nobs, + "d": [np.datetime64(d)] * nobs, + "r": [np.random.uniform()] * nobs, } ) - def time_frame_date_to_str(self, obs): + def time_frame_date_to_str(self, nobs): self.data["d"].astype(str) - def time_frame_date_formatting_default(self, obs): + def time_frame_date_formatting_default(self, nobs): + self.data["d"].dt.strftime(date_format=None) + + def time_frame_date_formatting_default_explicit(self, nobs): self.data["d"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_date_formatting_custom(self, obs): + def time_frame_date_formatting_custom(self, nobs): self.data["d"].dt.strftime(date_format="%Y---%m---%d") - def time_frame_datetime_to_str(self, obs): + def time_frame_datetime_to_str(self, nobs): self.data["dt"].astype(str) - def time_frame_datetime_formatting_default_date_only(self, obs): + def time_frame_datetime_formatting_default(self, nobs): + self.data["dt"].dt.strftime(date_format=None) + + def time_frame_datetime_formatting_default_explicit_date_only(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d") - def time_frame_datetime_formatting_default(self, obs): + def time_frame_datetime_formatting_default_explicit(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S") - def time_frame_datetime_formatting_default_with_float(self, obs): + def time_frame_datetime_formatting_default_with_float(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d %H:%M:%S.%f") - def time_frame_datetime_formatting_custom(self, obs): + def time_frame_datetime_formatting_custom(self, nobs): self.data["dt"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") +class PeriodStrftime: + timeout = 1500 + params = ([1000, 10000], ["D", "H"]) + param_names = ["nobs", "freq"] + + def setup(self, nobs, freq): 
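+        # Build a DataFrame with a Period column "p", mirror it into the
+        # index as "i", and record the frequency-appropriate default format,
+        # so the default and explicit strftime timers below measure the same
+        # rendered output.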
+ self.data = pd.DataFrame( + { + "p": pd.period_range(start="2000-01-01", periods=nobs, freq=freq), + "r": [np.random.uniform()] * nobs, + } + ) + self.data["i"] = self.data["p"] + self.data.set_index("i", inplace=True) + if freq == "D": + self.default_fmt = "%Y-%m-%d" + elif freq == "H": + self.default_fmt = "%Y-%m-%d %H:00" + + def time_frame_period_to_str(self, nobs, freq): + self.data["p"].astype(str) + + def time_frame_period_formatting_default(self, nobs, freq): + self.data["p"].dt.strftime(date_format=None) + + def time_frame_period_formatting_default_explicit(self, nobs, freq): + self.data["p"].dt.strftime(date_format=self.default_fmt) + + def time_frame_period_formatting_index_default(self, nobs, freq): + self.data.index.format() + + def time_frame_period_formatting_index_default_explicit(self, nobs, freq): + self.data.index.format(self.default_fmt) + + def time_frame_period_formatting_custom(self, nobs, freq): + self.data["p"].dt.strftime(date_format="%Y-%m-%d --- %H:%M:%S") + + def time_frame_period_formatting_iso8601_strftime_Z(self, nobs, freq): + self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%SZ") + + def time_frame_period_formatting_iso8601_strftime_offset(self, nobs, freq): + """Not optimized yet as %z is not supported by `convert_strftime_format`""" + self.data["p"].dt.strftime(date_format="%Y-%m-%dT%H:%M:%S%z") + + class BusinessHourStrftime: timeout = 1500 params = [1000, 10000] - param_names = ["obs"] + param_names = ["nobs"] - def setup(self, obs): + def setup(self, nobs): self.data = pd.DataFrame( { - "off": [offsets.BusinessHour()] * obs, + "off": [offsets.BusinessHour()] * nobs, } ) - def time_frame_offset_str(self, obs): + def time_frame_offset_str(self, nobs): self.data["off"].apply(str) - def time_frame_offset_repr(self, obs): + def time_frame_offset_repr(self, nobs): self.data["off"].apply(repr) diff --git a/asv_bench/benchmarks/tslibs/period.py b/asv_bench/benchmarks/tslibs/period.py index 2d192889c39f3..a92fbbe8d4dbe 100644 --- a/asv_bench/benchmarks/tslibs/period.py +++ b/asv_bench/benchmarks/tslibs/period.py @@ -60,6 +60,10 @@ class PeriodUnaryMethods: def setup(self, freq): self.per = Period("2012-06-01", freq=freq) + if freq == "M": + self.default_fmt = "%Y-%m" + elif freq == "min": + self.default_fmt = "%Y-%m-%d %H:%M" def time_to_timestamp(self, freq): self.per.to_timestamp() @@ -70,6 +74,21 @@ def time_now(self, freq): def time_asfreq(self, freq): self.per.asfreq("A") + def time_str(self, freq): + str(self.per) + + def time_repr(self, freq): + repr(self.per) + + def time_strftime_default(self, freq): + self.per.strftime(None) + + def time_strftime_default_explicit(self, freq): + self.per.strftime(self.default_fmt) + + def time_strftime_custom(self, freq): + self.per.strftime("%b. %d, %Y was a %A") + class PeriodConstructor: params = [["D"], [True, False]] diff --git a/ci/code_checks.sh b/ci/code_checks.sh index c046d55d80b49..756096a7fe345 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -21,23 +21,6 @@ BASE_DIR="$(dirname $0)/.." RET=0 CHECK=$1 -function invgrep { - # grep with inverse exist status and formatting for azure-pipelines - # - # This function works exactly as grep, but with opposite exit status: - # - 0 (success) when no patterns are found - # - 1 (fail) when the patterns are found - # - # This is useful for the CI, as we want to fail if one of the patterns - # that we want to avoid is found by grep. - grep -n "$@" | sed "s/^/$INVGREP_PREPEND/" | sed "s/$/$INVGREP_APPEND/" ; EXIT_STATUS=${PIPESTATUS[0]} - return $((! 
$EXIT_STATUS)) -} - -if [[ "$GITHUB_ACTIONS" == "true" ]]; then - INVGREP_PREPEND="##[error]" -fi - ### CODE ### if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then @@ -65,13 +48,8 @@ fi ### DOCTESTS ### if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then - MSG='Doctests' ; echo $MSG - # Ignore test_*.py files or else the unit tests will run - python -m pytest --doctest-modules --ignore-glob="**/test_*.py" pandas - RET=$(($RET + $?)) ; echo $MSG "DONE" - - MSG='Cython Doctests' ; echo $MSG - python -m pytest --doctest-cython pandas/_libs + MSG='Python and Cython Doctests' ; echo $MSG + python -c 'import pandas as pd; pd.test(run_doctests=True)' RET=$(($RET + $?)) ; echo $MSG "DONE" fi @@ -85,74 +63,14 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then MSG='Partially validate docstrings (EX01)' ; echo $MSG $BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX01 --ignore_functions \ - pandas.Series.__iter__ \ - pandas.Series.keys \ - pandas.Series.item \ - pandas.Series.pipe \ - pandas.Series.mode \ - pandas.Series.is_unique \ - pandas.Series.is_monotonic_increasing \ - pandas.Series.is_monotonic_decreasing \ pandas.Series.backfill \ - pandas.Series.bfill \ - pandas.Series.ffill \ pandas.Series.pad \ - pandas.Series.argsort \ - pandas.Series.reorder_levels \ - pandas.Series.ravel \ - pandas.Series.first_valid_index \ - pandas.Series.last_valid_index \ - pandas.Series.dt.date \ - pandas.Series.dt.time \ - pandas.Series.dt.timetz \ - pandas.Series.dt.dayofyear \ - pandas.Series.dt.day_of_year \ - pandas.Series.dt.quarter \ - pandas.Series.dt.daysinmonth \ - pandas.Series.dt.days_in_month \ - pandas.Series.dt.tz \ - pandas.Series.dt.end_time \ - pandas.Series.dt.days \ - pandas.Series.dt.seconds \ - pandas.Series.dt.microseconds \ - pandas.Series.dt.nanoseconds \ - pandas.Series.str.center \ - pandas.Series.str.decode \ - pandas.Series.str.encode \ - pandas.Series.str.find \ - pandas.Series.str.fullmatch \ - pandas.Series.str.index \ - pandas.Series.str.ljust \ - pandas.Series.str.match \ - pandas.Series.str.normalize \ - pandas.Series.str.rfind \ - pandas.Series.str.rindex \ - pandas.Series.str.rjust \ - pandas.Series.str.translate \ - pandas.Series.sparse \ - pandas.DataFrame.sparse \ - pandas.Series.cat.categories \ - pandas.Series.cat.ordered \ - pandas.Series.cat.codes \ - pandas.Series.cat.reorder_categories \ - pandas.Series.cat.set_categories \ - pandas.Series.cat.as_ordered \ - pandas.Series.cat.as_unordered \ - pandas.Series.sparse.fill_value \ - pandas.Flags \ - pandas.Series.attrs \ - pandas.Series.plot \ pandas.Series.hist \ - pandas.Series.to_string \ - pandas.errors.AbstractMethodError \ pandas.errors.AccessorRegistrationWarning \ pandas.errors.AttributeConflictWarning \ pandas.errors.DataError \ - pandas.errors.EmptyDataError \ pandas.errors.IncompatibilityWarning \ pandas.errors.InvalidComparison \ - pandas.errors.InvalidIndexError \ - pandas.errors.InvalidVersion \ pandas.errors.IntCastingNaNError \ pandas.errors.LossySetitemError \ pandas.errors.MergeError \ @@ -168,99 +86,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.errors.PyperclipWindowsException \ pandas.errors.UnsortedIndexError \ pandas.errors.UnsupportedFunctionCall \ - pandas.show_versions \ pandas.test \ pandas.NaT \ - pandas.Timestamp.as_unit \ - pandas.Timestamp.ctime \ - pandas.Timestamp.date \ - pandas.Timestamp.dst \ - pandas.Timestamp.isocalendar \ - pandas.Timestamp.isoweekday \ - pandas.Timestamp.strptime \ - pandas.Timestamp.time \ - pandas.Timestamp.timetuple \ - 
pandas.Timestamp.timetz \ - pandas.Timestamp.to_datetime64 \ - pandas.Timestamp.toordinal \ - pandas.Timestamp.tzname \ - pandas.Timestamp.utcoffset \ - pandas.Timestamp.utctimetuple \ - pandas.Timestamp.weekday \ - pandas.arrays.DatetimeArray \ - pandas.Timedelta.view \ - pandas.Timedelta.as_unit \ - pandas.Timedelta.ceil \ - pandas.Timedelta.floor \ - pandas.Timedelta.round \ - pandas.Timedelta.to_pytimedelta \ - pandas.Timedelta.to_timedelta64 \ - pandas.Timedelta.to_numpy \ - pandas.Timedelta.total_seconds \ - pandas.arrays.TimedeltaArray \ - pandas.Period.end_time \ - pandas.Period.freqstr \ - pandas.Period.is_leap_year \ - pandas.Period.month \ - pandas.Period.quarter \ - pandas.Period.year \ - pandas.Period.asfreq \ - pandas.Period.now \ - pandas.arrays.PeriodArray \ - pandas.Interval.closed \ - pandas.Interval.left \ - pandas.Interval.length \ - pandas.Interval.right \ - pandas.arrays.IntervalArray.left \ - pandas.arrays.IntervalArray.right \ - pandas.arrays.IntervalArray.closed \ - pandas.arrays.IntervalArray.mid \ - pandas.arrays.IntervalArray.length \ - pandas.arrays.IntervalArray.is_non_overlapping_monotonic \ - pandas.arrays.IntervalArray.from_arrays \ - pandas.arrays.IntervalArray.to_tuples \ - pandas.Int8Dtype \ - pandas.Int16Dtype \ - pandas.Int32Dtype \ - pandas.Int64Dtype \ - pandas.UInt8Dtype \ - pandas.UInt16Dtype \ - pandas.UInt32Dtype \ - pandas.UInt64Dtype \ - pandas.NA \ - pandas.Float32Dtype \ - pandas.Float64Dtype \ - pandas.CategoricalDtype.categories \ - pandas.CategoricalDtype.ordered \ - pandas.Categorical.dtype \ - pandas.Categorical.categories \ - pandas.Categorical.ordered \ - pandas.Categorical.codes \ - pandas.Categorical.__array__ \ - pandas.SparseDtype \ - pandas.DatetimeTZDtype.unit \ - pandas.DatetimeTZDtype.tz \ - pandas.PeriodDtype.freq \ - pandas.IntervalDtype.subtype \ - pandas_dtype \ - pandas.api.types.is_bool \ - pandas.api.types.is_complex \ - pandas.api.types.is_float \ - pandas.api.types.is_integer \ - pandas.api.types.pandas_dtype \ - pandas.read_clipboard \ - pandas.ExcelFile \ - pandas.ExcelFile.parse \ - pandas.DataFrame.to_html \ pandas.io.formats.style.Styler.to_html \ - pandas.HDFStore.put \ - pandas.HDFStore.append \ - pandas.HDFStore.get \ - pandas.HDFStore.select \ - pandas.HDFStore.info \ - pandas.HDFStore.keys \ - pandas.HDFStore.groups \ - pandas.HDFStore.walk \ pandas.read_feather \ pandas.DataFrame.to_feather \ pandas.read_parquet \ @@ -273,30 +101,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.io.stata.StataReader.value_labels \ pandas.io.stata.StataReader.variable_labels \ pandas.io.stata.StataWriter.write_file \ - pandas.core.resample.Resampler.__iter__ \ - pandas.core.resample.Resampler.groups \ - pandas.core.resample.Resampler.indices \ - pandas.core.resample.Resampler.get_group \ - pandas.core.resample.Resampler.ffill \ - pandas.core.resample.Resampler.asfreq \ - pandas.core.resample.Resampler.count \ - pandas.core.resample.Resampler.nunique \ - pandas.core.resample.Resampler.max \ - pandas.core.resample.Resampler.mean \ - pandas.core.resample.Resampler.median \ - pandas.core.resample.Resampler.min \ - pandas.core.resample.Resampler.ohlc \ - pandas.core.resample.Resampler.prod \ - pandas.core.resample.Resampler.size \ - pandas.core.resample.Resampler.sem \ - pandas.core.resample.Resampler.std \ - pandas.core.resample.Resampler.sum \ - pandas.core.resample.Resampler.var \ - pandas.core.resample.Resampler.quantile \ - pandas.describe_option \ - pandas.reset_option \ - pandas.get_option \ - 
pandas.set_option \ pandas.plotting.deregister_matplotlib_converters \ pandas.plotting.plot_params \ pandas.plotting.register_matplotlib_converters \ @@ -305,189 +109,9 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.util.hash_pandas_object \ pandas_object \ pandas.api.interchange.from_dataframe \ - pandas.Index.values \ - pandas.Index.dtype \ - pandas.Index.inferred_type \ - pandas.Index.shape \ - pandas.Index.name \ - pandas.Index.nbytes \ - pandas.Index.ndim \ - pandas.Index.size \ - pandas.Index.T \ - pandas.Index.memory_usage \ - pandas.Index.copy \ - pandas.Index.drop \ - pandas.Index.identical \ - pandas.Index.insert \ - pandas.Index.is_ \ - pandas.Index.take \ - pandas.Index.putmask \ - pandas.Index.unique \ - pandas.Index.fillna \ - pandas.Index.dropna \ - pandas.Index.astype \ - pandas.Index.item \ - pandas.Index.map \ - pandas.Index.ravel \ - pandas.Index.to_list \ - pandas.Index.append \ - pandas.Index.join \ - pandas.Index.asof_locs \ - pandas.Index.get_slice_bound \ - pandas.RangeIndex \ - pandas.RangeIndex.start \ - pandas.RangeIndex.stop \ - pandas.RangeIndex.step \ - pandas.RangeIndex.from_range \ - pandas.CategoricalIndex.codes \ - pandas.CategoricalIndex.categories \ - pandas.CategoricalIndex.ordered \ - pandas.CategoricalIndex.reorder_categories \ - pandas.CategoricalIndex.set_categories \ - pandas.CategoricalIndex.as_ordered \ - pandas.CategoricalIndex.as_unordered \ - pandas.CategoricalIndex.equals \ - pandas.IntervalIndex.closed \ - pandas.IntervalIndex.values \ - pandas.IntervalIndex.is_non_overlapping_monotonic \ - pandas.IntervalIndex.to_tuples \ - pandas.MultiIndex.dtypes \ - pandas.MultiIndex.drop \ - pandas.DatetimeIndex \ - pandas.DatetimeIndex.date \ - pandas.DatetimeIndex.time \ - pandas.DatetimeIndex.timetz \ - pandas.DatetimeIndex.dayofyear \ - pandas.DatetimeIndex.day_of_year \ - pandas.DatetimeIndex.quarter \ - pandas.DatetimeIndex.tz \ - pandas.DatetimeIndex.freqstr \ - pandas.DatetimeIndex.inferred_freq \ - pandas.DatetimeIndex.indexer_at_time \ - pandas.DatetimeIndex.indexer_between_time \ pandas.DatetimeIndex.snap \ - pandas.DatetimeIndex.as_unit \ - pandas.DatetimeIndex.to_pydatetime \ - pandas.DatetimeIndex.to_series \ - pandas.DatetimeIndex.mean \ - pandas.DatetimeIndex.std \ - pandas.TimedeltaIndex \ - pandas.TimedeltaIndex.days \ - pandas.TimedeltaIndex.seconds \ - pandas.TimedeltaIndex.microseconds \ - pandas.TimedeltaIndex.nanoseconds \ - pandas.TimedeltaIndex.components \ - pandas.TimedeltaIndex.inferred_freq \ - pandas.TimedeltaIndex.as_unit \ - pandas.TimedeltaIndex.to_pytimedelta \ - pandas.TimedeltaIndex.mean \ - pandas.PeriodIndex.day \ - pandas.PeriodIndex.dayofweek \ - pandas.PeriodIndex.day_of_week \ - pandas.PeriodIndex.dayofyear \ - pandas.PeriodIndex.day_of_year \ - pandas.PeriodIndex.days_in_month \ - pandas.PeriodIndex.daysinmonth \ - pandas.PeriodIndex.end_time \ - pandas.PeriodIndex.freqstr \ - pandas.PeriodIndex.hour \ - pandas.PeriodIndex.is_leap_year \ - pandas.PeriodIndex.minute \ - pandas.PeriodIndex.month \ - pandas.PeriodIndex.quarter \ - pandas.PeriodIndex.second \ - pandas.PeriodIndex.week \ - pandas.PeriodIndex.weekday \ - pandas.PeriodIndex.weekofyear \ - pandas.PeriodIndex.year \ - pandas.PeriodIndex.to_timestamp \ - pandas.core.window.rolling.Rolling.max \ - pandas.core.window.rolling.Rolling.cov \ - pandas.core.window.rolling.Rolling.skew \ - pandas.core.window.rolling.Rolling.apply \ - pandas.core.window.rolling.Window.mean \ - pandas.core.window.rolling.Window.sum \ - 
pandas.core.window.rolling.Window.var \ - pandas.core.window.rolling.Window.std \ - pandas.core.window.expanding.Expanding.count \ - pandas.core.window.expanding.Expanding.sum \ - pandas.core.window.expanding.Expanding.mean \ - pandas.core.window.expanding.Expanding.median \ - pandas.core.window.expanding.Expanding.min \ - pandas.core.window.expanding.Expanding.max \ - pandas.core.window.expanding.Expanding.corr \ - pandas.core.window.expanding.Expanding.cov \ - pandas.core.window.expanding.Expanding.skew \ - pandas.core.window.expanding.Expanding.apply \ - pandas.core.window.expanding.Expanding.quantile \ - pandas.core.window.ewm.ExponentialMovingWindow.mean \ - pandas.core.window.ewm.ExponentialMovingWindow.sum \ - pandas.core.window.ewm.ExponentialMovingWindow.std \ - pandas.core.window.ewm.ExponentialMovingWindow.var \ - pandas.core.window.ewm.ExponentialMovingWindow.corr \ - pandas.core.window.ewm.ExponentialMovingWindow.cov \ pandas.api.indexers.BaseIndexer \ pandas.api.indexers.VariableOffsetWindowIndexer \ - pandas.core.groupby.DataFrameGroupBy.__iter__ \ - pandas.core.groupby.SeriesGroupBy.__iter__ \ - pandas.core.groupby.DataFrameGroupBy.groups \ - pandas.core.groupby.SeriesGroupBy.groups \ - pandas.core.groupby.DataFrameGroupBy.indices \ - pandas.core.groupby.SeriesGroupBy.indices \ - pandas.core.groupby.DataFrameGroupBy.get_group \ - pandas.core.groupby.SeriesGroupBy.get_group \ - pandas.core.groupby.DataFrameGroupBy.all \ - pandas.core.groupby.DataFrameGroupBy.any \ - pandas.core.groupby.DataFrameGroupBy.bfill \ - pandas.core.groupby.DataFrameGroupBy.count \ - pandas.core.groupby.DataFrameGroupBy.cummax \ - pandas.core.groupby.DataFrameGroupBy.cummin \ - pandas.core.groupby.DataFrameGroupBy.cumprod \ - pandas.core.groupby.DataFrameGroupBy.cumsum \ - pandas.core.groupby.DataFrameGroupBy.diff \ - pandas.core.groupby.DataFrameGroupBy.ffill \ - pandas.core.groupby.DataFrameGroupBy.max \ - pandas.core.groupby.DataFrameGroupBy.median \ - pandas.core.groupby.DataFrameGroupBy.min \ - pandas.core.groupby.DataFrameGroupBy.ohlc \ - pandas.core.groupby.DataFrameGroupBy.pct_change \ - pandas.core.groupby.DataFrameGroupBy.prod \ - pandas.core.groupby.DataFrameGroupBy.sem \ - pandas.core.groupby.DataFrameGroupBy.shift \ - pandas.core.groupby.DataFrameGroupBy.size \ - pandas.core.groupby.DataFrameGroupBy.skew \ - pandas.core.groupby.DataFrameGroupBy.std \ - pandas.core.groupby.DataFrameGroupBy.sum \ - pandas.core.groupby.DataFrameGroupBy.var \ - pandas.core.groupby.SeriesGroupBy.all \ - pandas.core.groupby.SeriesGroupBy.any \ - pandas.core.groupby.SeriesGroupBy.bfill \ - pandas.core.groupby.SeriesGroupBy.count \ - pandas.core.groupby.SeriesGroupBy.cummax \ - pandas.core.groupby.SeriesGroupBy.cummin \ - pandas.core.groupby.SeriesGroupBy.cumprod \ - pandas.core.groupby.SeriesGroupBy.cumsum \ - pandas.core.groupby.SeriesGroupBy.diff \ - pandas.core.groupby.SeriesGroupBy.ffill \ - pandas.core.groupby.SeriesGroupBy.is_monotonic_increasing \ - pandas.core.groupby.SeriesGroupBy.is_monotonic_decreasing \ - pandas.core.groupby.SeriesGroupBy.max \ - pandas.core.groupby.SeriesGroupBy.median \ - pandas.core.groupby.SeriesGroupBy.min \ - pandas.core.groupby.SeriesGroupBy.nunique \ - pandas.core.groupby.SeriesGroupBy.ohlc \ - pandas.core.groupby.SeriesGroupBy.pct_change \ - pandas.core.groupby.SeriesGroupBy.prod \ - pandas.core.groupby.SeriesGroupBy.sem \ - pandas.core.groupby.SeriesGroupBy.shift \ - pandas.core.groupby.SeriesGroupBy.size \ - pandas.core.groupby.SeriesGroupBy.skew \ - 
pandas.core.groupby.SeriesGroupBy.std \ - pandas.core.groupby.SeriesGroupBy.sum \ - pandas.core.groupby.SeriesGroupBy.var \ - pandas.core.groupby.SeriesGroupBy.hist \ - pandas.core.groupby.DataFrameGroupBy.plot \ - pandas.core.groupby.SeriesGroupBy.plot \ pandas.io.formats.style.Styler \ pandas.io.formats.style.Styler.from_custom_template \ pandas.io.formats.style.Styler.set_caption \ @@ -521,6 +145,7 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.api.extensions.ExtensionArray.factorize \ pandas.api.extensions.ExtensionArray.fillna \ pandas.api.extensions.ExtensionArray.insert \ + pandas.api.extensions.ExtensionArray.interpolate \ pandas.api.extensions.ExtensionArray.isin \ pandas.api.extensions.ExtensionArray.isna \ pandas.api.extensions.ExtensionArray.ravel \ @@ -532,19 +157,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then pandas.api.extensions.ExtensionArray.ndim \ pandas.api.extensions.ExtensionArray.shape \ pandas.api.extensions.ExtensionArray.tolist \ - pandas.DataFrame.index \ pandas.DataFrame.columns \ - pandas.DataFrame.__iter__ \ - pandas.DataFrame.keys \ - pandas.DataFrame.iterrows \ - pandas.DataFrame.pipe \ pandas.DataFrame.backfill \ - pandas.DataFrame.bfill \ pandas.DataFrame.ffill \ pandas.DataFrame.pad \ pandas.DataFrame.swapaxes \ - pandas.DataFrame.first_valid_index \ - pandas.DataFrame.last_valid_index \ pandas.DataFrame.attrs \ pandas.DataFrame.plot \ pandas.DataFrame.to_gbq \ diff --git a/ci/condarc.yml b/ci/condarc.yml index 9d750b7102c39..f5fb60b208a9c 100644 --- a/ci/condarc.yml +++ b/ci/condarc.yml @@ -11,7 +11,7 @@ always_yes: true # The number seconds conda will wait for your client to establish a # connection to a remote url resource. # -remote_connect_timeout_secs: 30.0 +remote_connect_timeout_secs: 30 # remote_max_retries (int) # The maximum number of retries each HTTP connection should attempt. 
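As an aside on the consolidated doctest step in ci/code_checks.sh above: the two separate pytest invocations (Python doctests, then Cython doctests) are now a single call into pandas' own test entry point. A minimal local equivalent — a sketch assuming a development build of pandas is importable — is:

```python
# Mirror the CI invocation from ci/code_checks.sh: run the Python and
# Cython doctests through pandas' bundled test runner.
import pandas as pd

pd.test(run_doctests=True)
```

The same entry point accepts pytest-style arguments through `extra_args`, which is how the new ci/meta.yaml further below drives its marker- and keyword-filtered test runs.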
diff --git a/ci/deps/actions-310-numpydev.yaml b/ci/deps/actions-310-numpydev.yaml deleted file mode 100644 index c39289d38c211..0000000000000 --- a/ci/deps/actions-310-numpydev.yaml +++ /dev/null @@ -1,28 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.10 - - # build dependencies - - versioneer[toml] - - # test dependencies - - pytest>=7.0.0 - - pytest-cov - - pytest-xdist>=2.2.0 - - hypothesis>=6.46.1 - - pytest-asyncio>=0.17.0 - - # pandas dependencies - - python-dateutil - - pytz - - pip - - - pip: - - "cython" - - "--extra-index-url https://pypi.anaconda.org/scipy-wheels-nightly/simple" - - "--pre" - - "numpy" - - "scipy" - - "tzdata>=2022.1" diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index f159b71a1b48c..ffa7732c604a0 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -7,12 +7,15 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies @@ -32,12 +35,12 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 + - openpyxl>=3.0.10 - pandas-gbq>=0.17.5 - psycopg2>=2.9.3 - pyarrow>=7.0.0 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml similarity index 89% rename from ci/deps/actions-38-downstream_compat.yaml rename to ci/deps/actions-311-downstream_compat.yaml index dbfcc535fe3fb..596f3476c9c4e 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -3,17 +3,20 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.11 # build dependencies - versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies @@ -33,12 +36,12 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 + - openpyxl>=3.0.10 - pandas-gbq>=0.17.5 - psycopg2>=2.9.3 - pyarrow>=7.0.0 @@ -69,7 +72,6 @@ dependencies: - pandas-datareader - pyyaml - py - - pip: - pyqt5>=5.15.6 - tzdata>=2022.1 diff --git a/ci/deps/actions-311-numpydev.yaml b/ci/deps/actions-311-numpydev.yaml new file mode 100644 index 0000000000000..2cd4d5f3528f8 --- /dev/null +++ b/ci/deps/actions-311-numpydev.yaml @@ -0,0 +1,33 @@ +name: pandas-dev +channels: + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer[toml] + - meson[ninja]=1.0.1 + - meson-python=0.13.1 + + # test dependencies + - pytest>=7.3.2 + - pytest-cov + # Once pytest-cov > 4 comes out, unpin this + # Right now, a DeprecationWarning related to rsyncdir + # causes an InternalError within pytest + - pytest-xdist>=2.2.0, <3 + - hypothesis>=6.46.1 + - pytest-asyncio>=0.17.0 + + # pandas dependencies + - python-dateutil + - pytz + - pip + + - pip: + - "cython" + - "--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple" + - "--pre" + - "numpy" + - "scipy" + - "tzdata>=2022.1" diff --git 
a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index a55a33d020238..f24e866af0439 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -6,10 +6,12 @@ dependencies: # build dependencies - versioneer[toml] + - meson[ninja]=1.0.1 - cython>=0.29.33 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - hypothesis>=6.46.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 6da92a28965a2..9d60d734db5b3 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -7,12 +7,15 @@ dependencies: # build dependencies - versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies @@ -32,12 +35,12 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 - # - numba>=0.55.2 not compatible with 3.11 + - matplotlib>=3.6.1 + - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 + - openpyxl>=3.0.10 - pandas-gbq>=0.17.5 - psycopg2>=2.9.3 - pyarrow>=7.0.0 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml deleted file mode 100644 index 86877c5f1c263..0000000000000 --- a/ci/deps/actions-38.yaml +++ /dev/null @@ -1,60 +0,0 @@ -name: pandas-dev -channels: - - conda-forge -dependencies: - - python=3.8 - - # build dependencies - - versioneer[toml] - - cython>=0.29.33 - - # test dependencies - - pytest>=7.0.0 - - pytest-cov - - pytest-xdist>=2.2.0 - - pytest-asyncio>=0.17.0 - - boto3 - - # required dependencies - - python-dateutil - - numpy - - pytz - - # optional dependencies - - beautifulsoup4>=4.11.1 - - blosc>=1.21.0 - - bottleneck>=1.3.4 - - brotlipy>=0.7.0 - - fastparquet>=0.8.1 - - fsspec>=2022.05.0 - - html5lib>=1.1 - - hypothesis>=6.46.1 - - gcsfs>=2022.05.0 - - jinja2>=3.1.2 - - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 - - numba>=0.55.2 - - numexpr>=2.8.0 - - odfpy>=1.4.1 - - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 - - pandas-gbq>=0.17.5 - - psycopg2>=2.9.3 - - pyarrow>=7.0.0 - - pymysql>=1.0.2 - - pyreadstat>=1.1.5 - - pytables>=3.7.0 - - python-snappy>=0.6.1 - - pyxlsb>=1.0.9 - - s3fs>=2022.05.0 - - scipy>=1.8.1 - - sqlalchemy>=1.4.36 - - tabulate>=0.8.10 - - xarray>=2022.03.0 - - xlrd>=2.0.1 - - xlsxwriter>=3.0.3 - - zstandard>=0.17.0 - - - pip: - - pyqt5>=5.15.6 - - tzdata>=2022.1 diff --git a/ci/deps/actions-38-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml similarity index 91% rename from ci/deps/actions-38-minimum_versions.yaml rename to ci/deps/actions-39-minimum_versions.yaml index 96c6a0fd6eb2e..91961e4af2d1c 100644 --- a/ci/deps/actions-38-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -4,17 +4,20 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8.0 + - python=3.9 # build dependencies - versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index bc89fa7bbb8b9..6ea0d41b947dc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -7,12 +7,15 @@ dependencies: # build dependencies - 
versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies @@ -32,12 +35,12 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 + - matplotlib>=3.6.1 - numba>=0.55.2 - numexpr>=2.8.0 - odfpy>=1.4.1 - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 + - openpyxl>=3.0.10 - pandas-gbq>=0.17.5 - psycopg2>=2.9.3 - pyarrow>=7.0.0 diff --git a/ci/deps/actions-pypy-38.yaml b/ci/deps/actions-pypy-39.yaml similarity index 82% rename from ci/deps/actions-pypy-38.yaml rename to ci/deps/actions-pypy-39.yaml index 981399dcd4b7c..035395d55eb3a 100644 --- a/ci/deps/actions-pypy-38.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -5,14 +5,16 @@ dependencies: # TODO: Add the rest of the dependencies in here # once the other plentiful failures/segfaults # with base pandas has been dealt with - - python=3.8[build=*_pypy] # TODO: use this once pypy3.8 is available + - python=3.9[build=*_pypy] # build dependencies - versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-asyncio>=0.17.0 - pytest-xdist>=2.2.0 @@ -22,6 +24,5 @@ dependencies: - numpy - python-dateutil - pytz - - pip: - tzdata>=2022.1 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-310-arm64.yaml similarity index 80% rename from ci/deps/circle-38-arm64.yaml rename to ci/deps/circle-310-arm64.yaml index 85c4b82d55387..df4e8e285bd02 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -2,17 +2,20 @@ name: pandas-dev channels: - conda-forge dependencies: - - python=3.8 + - python=3.10 # build dependencies - versioneer[toml] - cython>=0.29.33 + - meson[ninja]=1.0.1 + - meson-python=0.13.1 # test dependencies - - pytest>=7.0.0 + - pytest>=7.3.2 - pytest-cov - pytest-xdist>=2.2.0 - pytest-asyncio>=0.17.0 + - pytest-localserver>=0.7.1 - boto3 # required dependencies @@ -32,12 +35,13 @@ dependencies: - gcsfs>=2022.05.0 - jinja2>=3.1.2 - lxml>=4.8.0 - - matplotlib>=3.6.1, <3.7.0 - - numba>=0.55.2 + - matplotlib>=3.6.1 + # test_numba_vs_cython segfaults with numba 0.57 + - numba>=0.55.2, <0.57.0 - numexpr>=2.8.0 - odfpy>=1.4.1 - qtpy>=2.2.0 - - openpyxl<3.1.1, >=3.0.10 + - openpyxl>=3.0.10 - pandas-gbq>=0.17.5 - psycopg2>=2.9.3 - pyarrow>=7.0.0 diff --git a/ci/fix_wheels.py b/ci/fix_wheels.py deleted file mode 100644 index 76b70fdde9ea0..0000000000000 --- a/ci/fix_wheels.py +++ /dev/null @@ -1,61 +0,0 @@ -""" -This file "repairs" our Windows wheels by copying the necessary DLLs for pandas to run -on a barebones Windows installation() into the wheel. - -NOTE: The paths for the DLLs are hard-coded to the location of the Visual Studio -redistributables -""" -import os -import shutil -import subprocess -from subprocess import CalledProcessError -import sys -import zipfile - -try: - if len(sys.argv) != 3: - raise ValueError( - "User must pass the path to the wheel and the destination directory." - ) - wheel_path = sys.argv[1] - dest_dir = sys.argv[2] - # Figure out whether we are building on 32 or 64 bit python - is_32 = sys.maxsize <= 2**32 - PYTHON_ARCH = "x86" if is_32 else "x64" -except ValueError: - # Too many/little values to unpack - raise ValueError( - "User must pass the path to the wheel and the destination directory." 
- ) -if not os.path.isdir(dest_dir): - print(f"Created directory {dest_dir}") - os.mkdir(dest_dir) - -wheel_name = os.path.basename(wheel_path) -success = True - -try: - # Use the wheel CLI for zipping up the wheel since the CLI will - # take care of rebuilding the hashes found in the record file - tmp_dir = os.path.join(dest_dir, "tmp") - with zipfile.ZipFile(wheel_path, "r") as f: - # Extracting all the members of the zip - # into a specific location. - f.extractall(path=tmp_dir) - base_redist_dir = ( - f"C:/Program Files (x86)/Microsoft Visual Studio/2019/" - f"Enterprise/VC/Redist/MSVC/14.29.30133/{PYTHON_ARCH}/" - f"Microsoft.VC142.CRT/" - ) - required_dlls = ["msvcp140.dll", "concrt140.dll"] - if not is_32: - required_dlls += ["vcruntime140_1.dll"] - dest_dll_dir = os.path.join(tmp_dir, "pandas/_libs/window") - for dll in required_dlls: - src = os.path.join(base_redist_dir, dll) - shutil.copy(src, dest_dll_dir) - subprocess.run(["wheel", "pack", tmp_dir, "-d", dest_dir], check=True) -except CalledProcessError: - print("Failed to add DLLS to wheel.") - sys.exit(1) -print("Successfully repaired wheel") diff --git a/ci/meta.yaml b/ci/meta.yaml new file mode 100644 index 0000000000000..09ae0d7253bf7 --- /dev/null +++ b/ci/meta.yaml @@ -0,0 +1,93 @@ +{% set version = "2.0.1" %} + +package: + name: pandas + version: {{ version }} + +source: + git_url: ../.. + +build: + number: 1 + script: + - export PYTHONUNBUFFERED=1 # [ppc64le] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . # [not unix] + - {{ PYTHON }} -m pip install -vv --no-deps --ignore-installed . --global-option="build_ext" --global-option="-j4" --no-use-pep517 # [unix] + skip: true # [py<39] + +requirements: + build: + - python # [build_platform != target_platform] + - cross-python_{{ target_platform }} # [build_platform != target_platform] + - cython # [build_platform != target_platform] + - numpy # [build_platform != target_platform] + - {{ compiler('c') }} + - {{ compiler('cxx') }} + host: + - python + - pip + - setuptools >=61.0.0 + - cython >=0.29.33,<3 + - numpy >=1.21.6 # [py<311] + - numpy >=1.23.2 # [py>=311] + - versioneer + - tomli # [py<311] + run: + - python + - {{ pin_compatible('numpy') }} + - python-dateutil >=2.8.2 + - pytz >=2020.1 + - python-tzdata >=2022.1 + +test: + imports: + - pandas + commands: + - pip check + # Skip test suite on PyPy as it segfaults there + # xref: https://github.com/conda-forge/pandas-feedstock/issues/148 + # + # Also skip `test_rolling_var_numerical_issues` on `ppc64le` as it is a known test failure. 
+ # xref: https://github.com/conda-forge/pandas-feedstock/issues/149 + {% set markers = ["not clipboard", "not single_cpu", "not db", "not network", "not slow"] %} + {% set markers = markers + ["not arm_slow"] %} # [aarch64 or ppc64le] + {% set extra_args = ["-n=2 -m " + " and ".join(markers)] %} + {% set tests_to_skip = "_not_a_real_test" %} + {% set tests_to_skip = tests_to_skip + " or test_rolling_var_numerical_issues" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_std_timedelta64_skipna_false" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_value_counts_normalized[M8[ns]]" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or test_to_datetime_format_YYYYMMDD_with_nat" %} # [ppc64le] + {% set tests_to_skip = tests_to_skip + " or (TestReductions and test_median_2d)" %} # [ppc64le] + {% set extra_args = extra_args + ["-k", "not (" + tests_to_skip + ")"] %} + - python -c "import pandas; pandas.test(extra_args={{ extra_args }})" # [python_impl == "cpython"] + requires: + - pip + - pytest >=7.3.2 + - pytest-asyncio >=0.17.0 + - pytest-xdist >=2.2.0 + - pytest-cov + - hypothesis >=6.46.1 + - tomli # [py<311] + +about: + home: http://pandas.pydata.org + license: BSD-3-Clause + license_file: LICENSE + summary: Powerful data structures for data analysis, time series, and statistics + doc_url: https://pandas.pydata.org/docs/ + dev_url: https://github.com/pandas-dev/pandas + +extra: + recipe-maintainers: + - jreback + - jorisvandenbossche + - msarahan + - ocefpaf + - TomAugspurger + - WillAyd + - simonjayhawkins + - mroeschke + - datapythonista + - phofl + - lithomas1 + - marcogorelli diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 90bacef920625..54e41ea449848 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -8,19 +8,9 @@ export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 429496 # May help reproduce flaky CI builds if set in subsequent runs echo PYTHONHASHSEED=$PYTHONHASHSEED -if [[ "not network" == *"$PATTERN"* ]]; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -COVERAGE="-s --cov=pandas --cov-report=xml --cov-append" - -# If no X server is found, we use xvfb to emulate it -if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then - export DISPLAY=":0" - XVFB="xvfb-run " -fi +COVERAGE="-s --cov=pandas --cov-report=xml --cov-append --cov-config=pyproject.toml" -PYTEST_CMD="${XVFB}pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="MESONPY_EDITABLE_VERBOSE=1 pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/ci/test_wheels.py b/ci/test_wheels.py deleted file mode 100644 index 75675d7e4ffc3..0000000000000 --- a/ci/test_wheels.py +++ /dev/null @@ -1,62 +0,0 @@ -import glob -import os -import shutil -import subprocess -from subprocess import CalledProcessError -import sys - -if os.name == "nt": - py_ver = f"{sys.version_info.major}.{sys.version_info.minor}" - is_32_bit = os.getenv("IS_32_BIT") == "true" - try: - wheel_dir = sys.argv[1] - wheel_path = glob.glob(f"{wheel_dir}/*.whl")[0] - except IndexError: - # Not passed - wheel_path = None - print(f"IS_32_BIT is {is_32_bit}") - print(f"Path to built wheel is {wheel_path}") - - print("Verifying file hashes in wheel RECORD file") - try: - tmp_dir = "tmp" - subprocess.run(["wheel", "unpack", wheel_path, "-d", tmp_dir], check=True) - except CalledProcessError: - print("wheel RECORD 
file hash verification failed.") - sys.exit(1) - finally: - shutil.rmtree(tmp_dir) - - if is_32_bit: - sys.exit(0) # No way to test Windows 32-bit(no docker image) - if wheel_path is None: - raise ValueError("Wheel path must be passed in if on 64-bit Windows") - print(f"Pulling docker image to test Windows 64-bit Python {py_ver}") - subprocess.run(f"docker pull python:{py_ver}-windowsservercore", check=True) - pandas_base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) - print(f"pandas project dir is {pandas_base_dir}") - dist_dir = os.path.join(pandas_base_dir, "dist") - print(f"Copying wheel into pandas_base_dir/dist ({dist_dir})") - os.mkdir(dist_dir) - shutil.copy(wheel_path, dist_dir) - print(os.listdir(dist_dir)) - subprocess.run( - rf"docker run -v %cd%:c:\pandas " - f"python:{py_ver}-windowsservercore /pandas/ci/test_wheels_windows.bat", - check=True, - shell=True, - cwd=pandas_base_dir, - ) -else: - import pandas as pd - - multi_args = [ - "-m not clipboard and not single_cpu and not slow and not network and not db", - "-n 2", - ] - pd.test(extra_args=multi_args) - pd.test( - extra_args=[ - "-m not clipboard and single_cpu and not slow and not network and not db", - ] - ) diff --git a/ci/test_wheels_windows.bat b/ci/test_wheels_windows.bat deleted file mode 100644 index 9864446d71137..0000000000000 --- a/ci/test_wheels_windows.bat +++ /dev/null @@ -1,9 +0,0 @@ -set test_command=import pandas as pd; print(pd.__version__); ^ -pd.test(extra_args=['-m not clipboard and not single_cpu and not slow and not network and not db', '-n 2']); ^ -pd.test(extra_args=['-m not clipboard and single_cpu and not slow and not network and not db']) - -python --version -pip install pytz six numpy python-dateutil tzdata>=2022.1 -pip install hypothesis>=6.46.1 pytest>=7.0.0 pytest-xdist>=2.2.0 pytest-asyncio>=0.17 -pip install --find-links=pandas/dist --no-index pandas -python -c "%test_command%" diff --git a/ci/upload_wheels.sh b/ci/upload_wheels.sh index f760621ea0e6b..3c4aa76c02003 100644 --- a/ci/upload_wheels.sh +++ b/ci/upload_wheels.sh @@ -10,7 +10,7 @@ set_upload_vars() { export ANACONDA_UPLOAD="true" elif [[ "$IS_SCHEDULE_DISPATCH" == "true" ]]; then echo scheduled or dispatched event - export ANACONDA_ORG="scipy-wheels-nightly" + export ANACONDA_ORG="scientific-python-nightly-wheels" export TOKEN="$PANDAS_NIGHTLY_UPLOAD_TOKEN" export ANACONDA_UPLOAD="true" else @@ -28,12 +28,12 @@ upload_wheels() { if compgen -G "./dist/*.gz"; then echo "Found sdist" anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./dist/*.gz - elif compgen -G "./wheelhouse/*.whl"; then + echo "Uploaded sdist" + fi + if compgen -G "./wheelhouse/*.whl"; then echo "Found wheel" anaconda -q -t ${TOKEN} upload --skip -u ${ANACONDA_ORG} ./wheelhouse/*.whl - else - echo "Files do not exist" - return 1 + echo "Uploaded wheel" fi echo "PyPI-style index: https://pypi.anaconda.org/$ANACONDA_ORG/simple" fi diff --git a/doc/data/tips.csv b/doc/data/tips.csv deleted file mode 100644 index 856a65a69e647..0000000000000 --- a/doc/data/tips.csv +++ /dev/null @@ -1,245 +0,0 @@ -total_bill,tip,sex,smoker,day,time,size -16.99,1.01,Female,No,Sun,Dinner,2 -10.34,1.66,Male,No,Sun,Dinner,3 -21.01,3.5,Male,No,Sun,Dinner,3 -23.68,3.31,Male,No,Sun,Dinner,2 -24.59,3.61,Female,No,Sun,Dinner,4 -25.29,4.71,Male,No,Sun,Dinner,4 -8.77,2.0,Male,No,Sun,Dinner,2 -26.88,3.12,Male,No,Sun,Dinner,4 -15.04,1.96,Male,No,Sun,Dinner,2 -14.78,3.23,Male,No,Sun,Dinner,2 -10.27,1.71,Male,No,Sun,Dinner,2 -35.26,5.0,Female,No,Sun,Dinner,4 
-15.42,1.57,Male,No,Sun,Dinner,2 -18.43,3.0,Male,No,Sun,Dinner,4 -14.83,3.02,Female,No,Sun,Dinner,2 -21.58,3.92,Male,No,Sun,Dinner,2 -10.33,1.67,Female,No,Sun,Dinner,3 -16.29,3.71,Male,No,Sun,Dinner,3 -16.97,3.5,Female,No,Sun,Dinner,3 -20.65,3.35,Male,No,Sat,Dinner,3 -17.92,4.08,Male,No,Sat,Dinner,2 -20.29,2.75,Female,No,Sat,Dinner,2 -15.77,2.23,Female,No,Sat,Dinner,2 -39.42,7.58,Male,No,Sat,Dinner,4 -19.82,3.18,Male,No,Sat,Dinner,2 -17.81,2.34,Male,No,Sat,Dinner,4 -13.37,2.0,Male,No,Sat,Dinner,2 -12.69,2.0,Male,No,Sat,Dinner,2 -21.7,4.3,Male,No,Sat,Dinner,2 -19.65,3.0,Female,No,Sat,Dinner,2 -9.55,1.45,Male,No,Sat,Dinner,2 -18.35,2.5,Male,No,Sat,Dinner,4 -15.06,3.0,Female,No,Sat,Dinner,2 -20.69,2.45,Female,No,Sat,Dinner,4 -17.78,3.27,Male,No,Sat,Dinner,2 -24.06,3.6,Male,No,Sat,Dinner,3 -16.31,2.0,Male,No,Sat,Dinner,3 -16.93,3.07,Female,No,Sat,Dinner,3 -18.69,2.31,Male,No,Sat,Dinner,3 -31.27,5.0,Male,No,Sat,Dinner,3 -16.04,2.24,Male,No,Sat,Dinner,3 -17.46,2.54,Male,No,Sun,Dinner,2 -13.94,3.06,Male,No,Sun,Dinner,2 -9.68,1.32,Male,No,Sun,Dinner,2 -30.4,5.6,Male,No,Sun,Dinner,4 -18.29,3.0,Male,No,Sun,Dinner,2 -22.23,5.0,Male,No,Sun,Dinner,2 -32.4,6.0,Male,No,Sun,Dinner,4 -28.55,2.05,Male,No,Sun,Dinner,3 -18.04,3.0,Male,No,Sun,Dinner,2 -12.54,2.5,Male,No,Sun,Dinner,2 -10.29,2.6,Female,No,Sun,Dinner,2 -34.81,5.2,Female,No,Sun,Dinner,4 -9.94,1.56,Male,No,Sun,Dinner,2 -25.56,4.34,Male,No,Sun,Dinner,4 -19.49,3.51,Male,No,Sun,Dinner,2 -38.01,3.0,Male,Yes,Sat,Dinner,4 -26.41,1.5,Female,No,Sat,Dinner,2 -11.24,1.76,Male,Yes,Sat,Dinner,2 -48.27,6.73,Male,No,Sat,Dinner,4 -20.29,3.21,Male,Yes,Sat,Dinner,2 -13.81,2.0,Male,Yes,Sat,Dinner,2 -11.02,1.98,Male,Yes,Sat,Dinner,2 -18.29,3.76,Male,Yes,Sat,Dinner,4 -17.59,2.64,Male,No,Sat,Dinner,3 -20.08,3.15,Male,No,Sat,Dinner,3 -16.45,2.47,Female,No,Sat,Dinner,2 -3.07,1.0,Female,Yes,Sat,Dinner,1 -20.23,2.01,Male,No,Sat,Dinner,2 -15.01,2.09,Male,Yes,Sat,Dinner,2 -12.02,1.97,Male,No,Sat,Dinner,2 -17.07,3.0,Female,No,Sat,Dinner,3 -26.86,3.14,Female,Yes,Sat,Dinner,2 -25.28,5.0,Female,Yes,Sat,Dinner,2 -14.73,2.2,Female,No,Sat,Dinner,2 -10.51,1.25,Male,No,Sat,Dinner,2 -17.92,3.08,Male,Yes,Sat,Dinner,2 -27.2,4.0,Male,No,Thur,Lunch,4 -22.76,3.0,Male,No,Thur,Lunch,2 -17.29,2.71,Male,No,Thur,Lunch,2 -19.44,3.0,Male,Yes,Thur,Lunch,2 -16.66,3.4,Male,No,Thur,Lunch,2 -10.07,1.83,Female,No,Thur,Lunch,1 -32.68,5.0,Male,Yes,Thur,Lunch,2 -15.98,2.03,Male,No,Thur,Lunch,2 -34.83,5.17,Female,No,Thur,Lunch,4 -13.03,2.0,Male,No,Thur,Lunch,2 -18.28,4.0,Male,No,Thur,Lunch,2 -24.71,5.85,Male,No,Thur,Lunch,2 -21.16,3.0,Male,No,Thur,Lunch,2 -28.97,3.0,Male,Yes,Fri,Dinner,2 -22.49,3.5,Male,No,Fri,Dinner,2 -5.75,1.0,Female,Yes,Fri,Dinner,2 -16.32,4.3,Female,Yes,Fri,Dinner,2 -22.75,3.25,Female,No,Fri,Dinner,2 -40.17,4.73,Male,Yes,Fri,Dinner,4 -27.28,4.0,Male,Yes,Fri,Dinner,2 -12.03,1.5,Male,Yes,Fri,Dinner,2 -21.01,3.0,Male,Yes,Fri,Dinner,2 -12.46,1.5,Male,No,Fri,Dinner,2 -11.35,2.5,Female,Yes,Fri,Dinner,2 -15.38,3.0,Female,Yes,Fri,Dinner,2 -44.3,2.5,Female,Yes,Sat,Dinner,3 -22.42,3.48,Female,Yes,Sat,Dinner,2 -20.92,4.08,Female,No,Sat,Dinner,2 -15.36,1.64,Male,Yes,Sat,Dinner,2 -20.49,4.06,Male,Yes,Sat,Dinner,2 -25.21,4.29,Male,Yes,Sat,Dinner,2 -18.24,3.76,Male,No,Sat,Dinner,2 -14.31,4.0,Female,Yes,Sat,Dinner,2 -14.0,3.0,Male,No,Sat,Dinner,2 -7.25,1.0,Female,No,Sat,Dinner,1 -38.07,4.0,Male,No,Sun,Dinner,3 -23.95,2.55,Male,No,Sun,Dinner,2 -25.71,4.0,Female,No,Sun,Dinner,3 -17.31,3.5,Female,No,Sun,Dinner,2 -29.93,5.07,Male,No,Sun,Dinner,4 -10.65,1.5,Female,No,Thur,Lunch,2 
-12.43,1.8,Female,No,Thur,Lunch,2 -24.08,2.92,Female,No,Thur,Lunch,4 -11.69,2.31,Male,No,Thur,Lunch,2 -13.42,1.68,Female,No,Thur,Lunch,2 -14.26,2.5,Male,No,Thur,Lunch,2 -15.95,2.0,Male,No,Thur,Lunch,2 -12.48,2.52,Female,No,Thur,Lunch,2 -29.8,4.2,Female,No,Thur,Lunch,6 -8.52,1.48,Male,No,Thur,Lunch,2 -14.52,2.0,Female,No,Thur,Lunch,2 -11.38,2.0,Female,No,Thur,Lunch,2 -22.82,2.18,Male,No,Thur,Lunch,3 -19.08,1.5,Male,No,Thur,Lunch,2 -20.27,2.83,Female,No,Thur,Lunch,2 -11.17,1.5,Female,No,Thur,Lunch,2 -12.26,2.0,Female,No,Thur,Lunch,2 -18.26,3.25,Female,No,Thur,Lunch,2 -8.51,1.25,Female,No,Thur,Lunch,2 -10.33,2.0,Female,No,Thur,Lunch,2 -14.15,2.0,Female,No,Thur,Lunch,2 -16.0,2.0,Male,Yes,Thur,Lunch,2 -13.16,2.75,Female,No,Thur,Lunch,2 -17.47,3.5,Female,No,Thur,Lunch,2 -34.3,6.7,Male,No,Thur,Lunch,6 -41.19,5.0,Male,No,Thur,Lunch,5 -27.05,5.0,Female,No,Thur,Lunch,6 -16.43,2.3,Female,No,Thur,Lunch,2 -8.35,1.5,Female,No,Thur,Lunch,2 -18.64,1.36,Female,No,Thur,Lunch,3 -11.87,1.63,Female,No,Thur,Lunch,2 -9.78,1.73,Male,No,Thur,Lunch,2 -7.51,2.0,Male,No,Thur,Lunch,2 -14.07,2.5,Male,No,Sun,Dinner,2 -13.13,2.0,Male,No,Sun,Dinner,2 -17.26,2.74,Male,No,Sun,Dinner,3 -24.55,2.0,Male,No,Sun,Dinner,4 -19.77,2.0,Male,No,Sun,Dinner,4 -29.85,5.14,Female,No,Sun,Dinner,5 -48.17,5.0,Male,No,Sun,Dinner,6 -25.0,3.75,Female,No,Sun,Dinner,4 -13.39,2.61,Female,No,Sun,Dinner,2 -16.49,2.0,Male,No,Sun,Dinner,4 -21.5,3.5,Male,No,Sun,Dinner,4 -12.66,2.5,Male,No,Sun,Dinner,2 -16.21,2.0,Female,No,Sun,Dinner,3 -13.81,2.0,Male,No,Sun,Dinner,2 -17.51,3.0,Female,Yes,Sun,Dinner,2 -24.52,3.48,Male,No,Sun,Dinner,3 -20.76,2.24,Male,No,Sun,Dinner,2 -31.71,4.5,Male,No,Sun,Dinner,4 -10.59,1.61,Female,Yes,Sat,Dinner,2 -10.63,2.0,Female,Yes,Sat,Dinner,2 -50.81,10.0,Male,Yes,Sat,Dinner,3 -15.81,3.16,Male,Yes,Sat,Dinner,2 -7.25,5.15,Male,Yes,Sun,Dinner,2 -31.85,3.18,Male,Yes,Sun,Dinner,2 -16.82,4.0,Male,Yes,Sun,Dinner,2 -32.9,3.11,Male,Yes,Sun,Dinner,2 -17.89,2.0,Male,Yes,Sun,Dinner,2 -14.48,2.0,Male,Yes,Sun,Dinner,2 -9.6,4.0,Female,Yes,Sun,Dinner,2 -34.63,3.55,Male,Yes,Sun,Dinner,2 -34.65,3.68,Male,Yes,Sun,Dinner,4 -23.33,5.65,Male,Yes,Sun,Dinner,2 -45.35,3.5,Male,Yes,Sun,Dinner,3 -23.17,6.5,Male,Yes,Sun,Dinner,4 -40.55,3.0,Male,Yes,Sun,Dinner,2 -20.69,5.0,Male,No,Sun,Dinner,5 -20.9,3.5,Female,Yes,Sun,Dinner,3 -30.46,2.0,Male,Yes,Sun,Dinner,5 -18.15,3.5,Female,Yes,Sun,Dinner,3 -23.1,4.0,Male,Yes,Sun,Dinner,3 -15.69,1.5,Male,Yes,Sun,Dinner,2 -19.81,4.19,Female,Yes,Thur,Lunch,2 -28.44,2.56,Male,Yes,Thur,Lunch,2 -15.48,2.02,Male,Yes,Thur,Lunch,2 -16.58,4.0,Male,Yes,Thur,Lunch,2 -7.56,1.44,Male,No,Thur,Lunch,2 -10.34,2.0,Male,Yes,Thur,Lunch,2 -43.11,5.0,Female,Yes,Thur,Lunch,4 -13.0,2.0,Female,Yes,Thur,Lunch,2 -13.51,2.0,Male,Yes,Thur,Lunch,2 -18.71,4.0,Male,Yes,Thur,Lunch,3 -12.74,2.01,Female,Yes,Thur,Lunch,2 -13.0,2.0,Female,Yes,Thur,Lunch,2 -16.4,2.5,Female,Yes,Thur,Lunch,2 -20.53,4.0,Male,Yes,Thur,Lunch,4 -16.47,3.23,Female,Yes,Thur,Lunch,3 -26.59,3.41,Male,Yes,Sat,Dinner,3 -38.73,3.0,Male,Yes,Sat,Dinner,4 -24.27,2.03,Male,Yes,Sat,Dinner,2 -12.76,2.23,Female,Yes,Sat,Dinner,2 -30.06,2.0,Male,Yes,Sat,Dinner,3 -25.89,5.16,Male,Yes,Sat,Dinner,4 -48.33,9.0,Male,No,Sat,Dinner,4 -13.27,2.5,Female,Yes,Sat,Dinner,2 -28.17,6.5,Female,Yes,Sat,Dinner,3 -12.9,1.1,Female,Yes,Sat,Dinner,2 -28.15,3.0,Male,Yes,Sat,Dinner,5 -11.59,1.5,Male,Yes,Sat,Dinner,2 -7.74,1.44,Male,Yes,Sat,Dinner,2 -30.14,3.09,Female,Yes,Sat,Dinner,4 -12.16,2.2,Male,Yes,Fri,Lunch,2 -13.42,3.48,Female,Yes,Fri,Lunch,2 -8.58,1.92,Male,Yes,Fri,Lunch,1 -15.98,3.0,Female,No,Fri,Lunch,3 
-13.42,1.58,Male,Yes,Fri,Lunch,2 -16.27,2.5,Female,Yes,Fri,Lunch,2 -10.09,2.0,Female,Yes,Fri,Lunch,2 -20.45,3.0,Male,No,Sat,Dinner,4 -13.28,2.72,Male,No,Sat,Dinner,2 -22.12,2.88,Female,Yes,Sat,Dinner,2 -24.01,2.0,Male,Yes,Sat,Dinner,4 -15.69,3.0,Male,Yes,Sat,Dinner,3 -11.61,3.39,Male,No,Sat,Dinner,2 -10.77,1.47,Male,No,Sat,Dinner,2 -15.53,3.0,Male,Yes,Sat,Dinner,2 -10.07,1.25,Male,No,Sat,Dinner,2 -12.6,1.0,Male,Yes,Sat,Dinner,2 -32.83,1.17,Male,Yes,Sat,Dinner,2 -35.83,4.67,Female,No,Sat,Dinner,3 -29.03,5.92,Male,No,Sat,Dinner,3 -27.18,2.0,Female,Yes,Sat,Dinner,2 -22.67,2.0,Male,Yes,Sat,Dinner,2 -17.82,1.75,Male,No,Sat,Dinner,2 -18.78,3.0,Female,No,Thur,Dinner,2 diff --git a/doc/make.py b/doc/make.py index f5bf170c6274d..937b2638fb098 100755 --- a/doc/make.py +++ b/doc/make.py @@ -163,12 +163,12 @@ def _get_page_title(self, page): components=(docutils.parsers.rst.Parser,) ) doc = docutils.utils.new_document("", option_parser.get_default_values()) - with open(fname) as f: + with open(fname, encoding="utf-8") as f: data = f.read() parser = docutils.parsers.rst.Parser() # do not generate any warning when parsing the rst - with open(os.devnull, "a") as f: + with open(os.devnull, "a", encoding="utf-8") as f: doc.reporter.stream = f parser.parse(data, doc) @@ -186,7 +186,7 @@ def _add_redirects(self): Create in the build directory an html file with a redirect, for every row in REDIRECTS_FILE. """ - with open(REDIRECTS_FILE) as mapping_fd: + with open(REDIRECTS_FILE, encoding="utf-8") as mapping_fd: reader = csv.reader(mapping_fd) for row in reader: if not row or row[0].strip().startswith("#"): @@ -209,7 +209,7 @@ def _add_redirects(self): # sphinx specific stuff title = "this page" - with open(path, "w") as moved_page_fd: + with open(path, "w", encoding="utf-8") as moved_page_fd: html = f"""\ @@ -321,7 +321,7 @@ def main(): help=( "filename (relative to the 'source' folder) of section or method name to " "compile, e.g. 
'development/contributing.rst', " - "'ecosystem.rst', 'pandas.DataFrame.join'" + "'pandas.DataFrame.join'" ), ) argparser.add_argument( diff --git a/doc/source/_static/css/getting_started.css b/doc/source/_static/css/getting_started.css index e4c5964259349..0d53bbde94ae3 100644 --- a/doc/source/_static/css/getting_started.css +++ b/doc/source/_static/css/getting_started.css @@ -138,15 +138,21 @@ ul.task-bullet > li > p:first-child { margin: 0px; } -.comparison-card .card-img-top { +.comparison-card .sd-card-img-top { margin: 10px; margin-bottom: 20px; - height: 72px; + height: 52px; background: none !important; } -.comparison-card-excel .card-img-top, .comparison-card-stata .card-img-top, .comparison-card-sas .card-img-top { - height: 52px; +.comparison-card .sd-btn-secondary { + background-color: #6c757d !important; + border-color: #6c757d !important; +} + +.comparison-card .sd-btn-secondary:hover { + background-color: #5a6268 !important; + border-color: #545b62 !important; } .comparison-card .card-footer { @@ -178,6 +184,16 @@ ul.task-bullet > li > p:first-child { margin: 0 1em 1em 1em; } +.install-card .sd-btn-secondary { + background-color: #6c757d !important; + border-color: #6c757d !important; +} + +.install-card .sd-btn-secondary:hover { + background-color: #5a6268 !important; + border-color: #545b62 !important; +} + .custom-button { background-color:#DCDCDC; border: none; diff --git a/doc/source/_static/css/pandas.css b/doc/source/_static/css/pandas.css index c32a9c8f40ff5..1145177898737 100644 --- a/doc/source/_static/css/pandas.css +++ b/doc/source/_static/css/pandas.css @@ -12,42 +12,41 @@ table { /* Main index page overview cards */ .intro-card { - background: #fff; - border-radius: 0; padding: 30px 10px 20px 10px; - margin: 10px 0px; } -.intro-card p.card-text { - margin: 0px; -} - -.intro-card .card-img-top { +.intro-card .sd-card-img-top { margin: 10px; height: 52px; background: none !important; } -.intro-card .card-header { - border: none; - background-color: transparent; - color: #150458 !important; +.intro-card .sd-card-title { + color: var(--pst-color-primary); font-size: var(--pst-font-size-h5); - font-weight: bold; - padding: 2.5rem 0rem 0.5rem 0rem; + padding: 1rem 0rem 0.5rem 0rem; } -.intro-card .card-footer { - border: none; - background-color: transparent; +.intro-card .sd-card-footer { + border: none !important; } -.intro-card .card-footer p.card-text{ +.intro-card .sd-card-footer p.sd-card-text { max-width: 220px; margin-left: auto; margin-right: auto; } +.intro-card .sd-btn-secondary { + background-color: #6c757d !important; + border-color: #6c757d !important; +} + +.intro-card .sd-btn-secondary:hover { + background-color: #5a6268 !important; + border-color: #545b62 !important; +} + .card, .card img { background-color: var(--pst-color-background); } diff --git a/doc/source/conf.py b/doc/source/conf.py index c73a91aa90365..31893bdf929d8 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -56,7 +56,7 @@ "matplotlib.sphinxext.plot_directive", "numpydoc", "sphinx_copybutton", - "sphinx_panels", + "sphinx_design", "sphinx_toggleprompt", "sphinx.ext.autodoc", "sphinx.ext.autosummary", @@ -76,6 +76,7 @@ # to ensure that include files (partial pages) aren't built, exclude them # https://github.com/sphinx-doc/sphinx/issues/1965#issuecomment-124732907 "**/includes/**", + "**/api/pandas.Series.dt.rst", ] try: import nbconvert @@ -116,9 +117,9 @@ elif single_doc and rel_fname != pattern: exclude_patterns.append(rel_fname) -with 
open(os.path.join(source_path, "index.rst.template")) as f: +with open(os.path.join(source_path, "index.rst.template"), encoding="utf-8") as f: t = jinja2.Template(f.read()) -with open(os.path.join(source_path, "index.rst"), "w") as f: +with open(os.path.join(source_path, "index.rst"), "w", encoding="utf-8") as f: f.write( t.render( include_api=include_api, @@ -142,10 +143,6 @@ # nbsphinx do not use requirejs (breaks bootstrap) nbsphinx_requirejs_path = "" -# sphinx-panels shouldn't add bootstrap css since the pydata-sphinx-theme -# already loads it -panels_add_bootstrap_css = False - # https://sphinx-toggleprompt.readthedocs.io/en/stable/#offset toggleprompt_offset_right = 35 @@ -240,16 +237,23 @@ html_theme_options = { "external_links": [], - "footer_items": ["pandas_footer", "sphinx-version"], + "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", "twitter_url": "https://twitter.com/pandas_dev", - "google_analytics_id": "UA-27880019-2", + "analytics": {"google_analytics_id": "G-5RE31C1RNW"}, "logo": {"image_dark": "https://pandas.pydata.org/static/img/pandas_white.svg"}, "navbar_end": ["version-switcher", "theme-switcher", "navbar-icon-links"], "switcher": { - "json_url": "/versions.json", + "json_url": "https://pandas.pydata.org/versions.json", "version_match": switcher_version, }, + "icon_links": [ + { + "name": "Mastodon", + "url": "https://fosstodon.org/@pandas_dev", + "icon": "fa-brands fa-mastodon", + }, + ], } # Add any paths that contain custom themes here, relative to this directory. @@ -459,11 +463,10 @@ # extlinks alias extlinks = { - "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH"), + "issue": ("https://github.com/pandas-dev/pandas/issues/%s", "GH %s"), } -ipython_warning_is_error = False ipython_execlines = [ "import numpy as np", "import pandas as pd", diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index ea69f0b907d8b..4b9a6ba1e069c 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -119,6 +119,22 @@ Some great resources for learning Git: * the `NumPy documentation `_. * Matthew Brett's `Pydagogue `_. +Also, the project follows a forking workflow further described on this page whereby +contributors fork the repository, make changes and then create a pull request. +So please be sure to read and follow all the instructions in this guide. + +If you are new to contributing to projects through forking on GitHub, +take a look at the `GitHub documentation for contributing to projects `_. +GitHub provides a quick tutorial using a test repository that may help you become more familiar +with forking a repository, cloning a fork, creating a feature branch, pushing changes and +making pull requests. + +Below are some useful resources for learning more about forking and pull requests on GitHub: + +* the `GitHub documentation for forking a repo `_. +* the `GitHub documentation for collaborating with pull requests `_. +* the `GitHub documentation for working with forks `_. + Getting started with Git ------------------------ diff --git a/doc/source/development/contributing_codebase.rst b/doc/source/development/contributing_codebase.rst index e99dbbde3db85..f3ff5b70d4aac 100644 --- a/doc/source/development/contributing_codebase.rst +++ b/doc/source/development/contributing_codebase.rst @@ -612,23 +612,17 @@ deleted when the context block is exited. 
Testing involving network connectivity
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-It is highly discouraged to add a test that connects to the internet due to flakiness of network connections and
-lack of ownership of the server that is being connected to. If network connectivity is absolutely required, use the
-``tm.network`` decorator.
+A unit test should not access a public data set over the internet due to the flakiness of network connections and
+lack of ownership of the server being connected to. To mock this interaction, use the ``httpserver`` fixture from the
+`pytest-localserver `_ plugin with synthetic data.

 .. code-block:: python

-    @tm.network # noqa
-    def test_network():
-        result = package.call_to_internet()
-
-If the test requires data from a specific website, specify ``check_before_test=True`` and the site in the decorator.
-
-.. code-block:: python
-
-    @tm.network("https://www.somespecificsite.com", check_before_test=True)
-    def test_network():
-        result = pd.read_html("https://www.somespecificsite.com")
+    @pytest.mark.network
+    @pytest.mark.single_cpu
+    def test_network(httpserver):
+        httpserver.serve_content(content="content")
+        result = pd.read_html(httpserver.url)

 Example
 ^^^^^^^
@@ -770,7 +764,7 @@ install pandas) by typing::
 your installation is probably fine and you can start contributing!

 Often it is worth running only a subset of tests first around your changes before running the
-entire suite (tip: you can use the [pandas-coverage app](https://pandas-coverage.herokuapp.com/)
+entire suite (tip: you can use the `pandas-coverage app <https://pandas-coverage-12d2130077bc.herokuapp.com/>`__
 to find out which tests hit the lines of code you've modified, and then run only those).

 The easiest way to do this is with::
@@ -867,7 +861,7 @@ performance regressions.
 pandas is in the process of migrating to
 `asv benchmarks `__
 to enable easy monitoring of the performance of critical pandas operations.
 These benchmarks are all found in the ``pandas/asv_bench`` directory, and the
-test results can be found `here `__.
+test results can be found `here `__.

 To use all features of asv, you will need either ``conda`` or ``virtualenv``. For more details please
 check the `asv installation
@@ -959,9 +953,9 @@ directive is used. The sphinx syntax for that is:

 .. code-block:: rst

-    .. versionadded:: 1.1.0
+    .. versionadded:: 2.1.0

-This will put the text *New in version 1.1.0* wherever you put the sphinx
+This will put the text *New in version 2.1.0* wherever you put the sphinx
 directive. This should also be put in the docstring when adding a new function
 or method (`example `__)
 or a new keyword argument (`example `__).
diff --git a/doc/source/development/contributing_docstring.rst b/doc/source/development/contributing_docstring.rst
index 6524e4da2299d..87aecb6936c9c 100644
--- a/doc/source/development/contributing_docstring.rst
+++ b/doc/source/development/contributing_docstring.rst
@@ -652,9 +652,9 @@ A simple example could be:

     Examples
     --------
-    >>> s = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon',
+    >>> ser = pd.Series(['Ant', 'Bear', 'Cow', 'Dog', 'Falcon',
    ...                 'Lion', 'Monkey', 'Rabbit', 'Zebra'])
-    >>> s.head()
+    >>> ser.head()
     0    Ant
     1    Bear
     2    Cow
@@ -664,7 +664,7 @@ A simple example could be:

     With the ``n`` parameter, we can change the number of returned rows:

-    >>> s.head(n=3)
+    >>> ser.head(n=3)
     0    Ant
     1    Bear
     2    Cow
@@ -695,10 +695,10 @@ and avoiding aliases.
Avoid excessive imports, but if needed, imports from the standard
library go first, followed by third-party libraries (like matplotlib).

-When illustrating examples with a single ``Series`` use the name ``s``, and if
+When illustrating examples with a single ``Series`` use the name ``ser``, and if
 illustrating with a single ``DataFrame`` use the name ``df``. For indices,
 ``idx`` is the preferred name. If a set of homogeneous ``Series`` or
-``DataFrame`` is used, name them ``s1``, ``s2``, ``s3``... or ``df1``,
+``DataFrame`` is used, name them ``ser1``, ``ser2``, ``ser3``... or ``df1``,
 ``df2``, ``df3``... If the data is not homogeneous, and more than one structure
 is needed, name them with something meaningful, for example ``df_main`` and
 ``df_to_join``.
@@ -731,8 +731,8 @@ positional arguments ``head(3)``.

         Examples
         --------
-        >>> s = pd.Series([1, 2, 3])
-        >>> s.mean()
+        >>> ser = pd.Series([1, 2, 3])
+        >>> ser.mean()
         2
         """
         pass
@@ -744,8 +744,8 @@ positional arguments ``head(3)``.

         Examples
         --------
-        >>> s = pd.Series([1, np.nan, 3])
-        >>> s.fillna(0)
+        >>> ser = pd.Series([1, np.nan, 3])
+        >>> ser.fillna(0)
         [1, 0, 3]
         """
         pass
@@ -756,10 +756,10 @@ positional arguments ``head(3)``.

         Examples
         --------
-        >>> s = pd.Series([380., 370., 24., 26],
+        >>> ser = pd.Series([380., 370., 24., 26],
         ...               name='max_speed',
         ...               index=['falcon', 'falcon', 'parrot', 'parrot'])
-        >>> s.groupby_mean()
+        >>> ser.groupby_mean()
         index
         falcon    375.0
         parrot     25.0
@@ -776,8 +776,8 @@ positional arguments ``head(3)``.

         Examples
         --------
-        >>> s = pd.Series('Antelope', 'Lion', 'Zebra', np.nan)
-        >>> s.contains(pattern='a')
+        >>> ser = pd.Series(['Antelope', 'Lion', 'Zebra', np.nan])
+        >>> ser.contains(pattern='a')
         0      False
         1      False
         2       True
@@ -800,7 +800,7 @@ positional arguments ``head(3)``.

         We can fill missing values in the output using the ``na`` parameter:

-        >>> s.contains(pattern='a', na=False)
+        >>> ser.contains(pattern='a', na=False)
         0      False
         1      False
         2       True
@@ -920,8 +920,8 @@ plot will be generated automatically when building the documentation.

     .. plot::
        :context: close-figs

-       >>> s = pd.Series([1, 2, 3])
-       >>> s.plot()
+       >>> ser = pd.Series([1, 2, 3])
+       >>> ser.plot()
     """
     pass
diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst
index 858c3322a14b0..51d0edf1859c5 100644
--- a/doc/source/development/contributing_environment.rst
+++ b/doc/source/development/contributing_environment.rst
@@ -207,13 +207,59 @@ for :ref:`building pandas with GitPod `.
 Step 3: build and install pandas
 --------------------------------

-You can now run::
+There are currently two supported ways of building pandas: pip/meson and setuptools (``setup.py``).
+Historically, pandas has only supported using setuptools to build pandas. However, this method
+requires a lot of convoluted code in ``setup.py`` and also has many issues when compiling pandas in parallel
+due to limitations in setuptools.
+
+The newer build system invokes the meson backend through pip (via a `PEP 517 `_ build).
+It automatically uses all available cores on your CPU, and also avoids the need for manual rebuilds by
+rebuilding automatically whenever pandas is imported (with an editable install).
+
+For these reasons, you should compile pandas with meson.
+Because the meson build system is newer, you may find bugs or minor issues as it matures. You can report these bugs
+`here `_.
+
+To compile pandas with meson, run::

     # Build and install pandas
-    # The number after -j is the number of compiling jobs run in parallel
-    # Change it according to your machine's hardware spec
-    python setup.py build_ext -j 4
-    python -m pip install -e . --no-build-isolation --no-use-pep517
+    # By default, this will print verbose output
+    # showing the "rebuild" taking place on import (see section below for explanation)
+    # If you do not want to see this, omit everything after --no-build-isolation
+    python -m pip install -ve . --no-build-isolation --config-settings editable-verbose=true
+
+**Build options**
+
+It is possible to pass options from the pip frontend to the meson backend if you would like to configure your
+install. Occasionally, you'll want to use this to adjust the build directory, and/or toggle debug/optimization levels.
+
+You can pass a build directory to pandas by appending ``--config-settings builddir="your builddir here"`` to your pip command.
+This option allows you to configure where meson stores your built C extensions, and allows for fast rebuilds.
+
+Sometimes it might be useful to compile pandas with debugging symbols when debugging C extensions.
+Appending ``--config-settings setup-args="-Ddebug=true"`` will do the trick.
+
+With pip, it is possible to chain together multiple config settings (for example, specifying both a build directory
+and building with debug symbols would look like
+``--config-settings builddir="your builddir here" --config-settings setup-args="-Dbuildtype=debug"``).
+
+**Compiling pandas with setup.py**
+
+.. note::
+   This method of compiling pandas will be deprecated and removed very soon, as the meson backend matures.
+
+To compile pandas with setuptools, run::
+
+   python setup.py develop
+
+.. note::
+   If pandas is already installed (via meson), you have to uninstall it first::
+
+      python -m pip uninstall pandas
+
+This is because ``python setup.py develop`` will not uninstall the loader script that ``meson-python``
+uses to import the extension from the build folder, which may cause errors such as a
+``FileNotFoundError`` being raised.

 .. note::
    You will need to repeat this step each time the C extensions change, for example
@@ -226,5 +272,22 @@ At this point you should be able to import pandas from your locally built versio

    >>> print(pandas.__version__)  # note: the exact output may differ
    2.0.0.dev0+880.g2b9e661fbb.dirty

-This will create the new environment, and not touch any of your existing environments,
-nor any existing Python installation.
+When building pandas with meson, importing pandas will automatically trigger a rebuild whenever
+C/Cython files have been modified. By default, no output will be produced by this rebuild (the import will
+just take longer). If you would like to see meson's
+output when importing pandas, you can set the environment variable ``MESONPY_EDITABLE_VERBOSE``. For example, this would be::
+
+   # On Linux/macOS
+   MESONPY_EDITABLE_VERBOSE=1 python
+
+   # Windows
+   set MESONPY_EDITABLE_VERBOSE=1 # Only need to set this once per session
+   python
+
+If you would like to see this verbose output every time, you can set the ``editable-verbose`` config setting to ``true`` like so::
+
+   python -m pip install -ve . --config-settings editable-verbose=true
+
+.. tip::
+   If you ever find yourself wondering whether setuptools or meson was used to build your pandas,
+   you can check the value of ``pandas._built_with_meson``, which will be true if meson was used
+   to compile pandas.
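+
+As a quick check from the command line (a minimal sketch; note that
+``_built_with_meson`` is a private attribute, so treat this as a debugging aid
+rather than stable API)::
+
+   # prints True for a meson build, False for a setuptools build
+   python -c "import pandas; print(pandas._built_with_meson)"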
diff --git a/doc/source/development/contributing_gitpod.rst b/doc/source/development/contributing_gitpod.rst
index c591be5425db9..042a2f316cd42 100644
--- a/doc/source/development/contributing_gitpod.rst
+++ b/doc/source/development/contributing_gitpod.rst
@@ -29,7 +29,7 @@ you do not have an account yet, you will need to create one first.

 To get started just log in at `Gitpod`_, and grant the appropriate permissions
 to GitHub.

-We have built a python 3.8 environment and all development dependencies will
+We have built a Python 3.10 environment and all development dependencies will
 install when the environment starts.
diff --git a/doc/source/development/debugging_extensions.rst b/doc/source/development/debugging_extensions.rst
index 8fa9c94ae96c9..9ac4cf4083475 100644
--- a/doc/source/development/debugging_extensions.rst
+++ b/doc/source/development/debugging_extensions.rst
@@ -13,3 +13,21 @@ For Python developers with limited or no C/C++ experience this can seem a daunti
 1. `Fundamental Python Debugging Part 1 - Python `_
 2. `Fundamental Python Debugging Part 2 - Python Extensions `_
 3. `Fundamental Python Debugging Part 3 - Cython Extensions `_
+
+Generating debug builds
+-----------------------
+
+By default, building pandas from source will generate a release build. To generate a development build you can type::
+
+   pip install -ve . --no-build-isolation --config-settings=builddir="debug" --config-settings=setup-args="-Dbuildtype=debug"
+
+By specifying ``builddir="debug"`` all of the targets will be built and placed in the debug directory relative to the project root. This helps to keep your debug and release artifacts separate; you are of course able to choose a different directory name or omit it altogether if you do not care to separate build types.
+
+Editor support
+--------------
+
+The meson build system generates a `compilation database `_ automatically and places it in the build directory. Many language servers and IDEs can use this information to provide code-completion, go-to-definition and error checking support as you type.
+
+How each language server / IDE chooses to look for the compilation database may vary. When in doubt you may want to create a symlink at the root of the project that points to the compilation database in your build directory. Assuming you used *debug* as your directory name, you can run::
+
+   ln -s debug/compile_commands.json .
diff --git a/doc/source/development/extending.rst b/doc/source/development/extending.rst
index b829cfced6962..f74eacb6b861d 100644
--- a/doc/source/development/extending.rst
+++ b/doc/source/development/extending.rst
@@ -60,7 +60,7 @@ Now users can access your methods using the ``geo`` namespace:

 This can be a convenient way to extend pandas objects without subclassing them.
 If you write a custom accessor, make a pull request adding it to our
-:ref:`ecosystem` page.
+`ecosystem `_ page.

 We highly recommend validating the data in your accessor's ``__init__``.
 In our ``GeoAccessor``, we validate that the data contains the expected columns,
@@ -91,7 +91,7 @@ objects). Many methods like :func:`pandas.isna` will dispatch to the extension
 type's implementation. If you're building a library that implements the
 interface, please publicize it
-on :ref:`ecosystem.extensions`.
+on `the ecosystem page `_.

 The interface consists of two classes.
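+
+As a minimal sketch of how those two pieces fit together (assuming only the
+public ``pandas.api.extensions`` entry points; the elided methods are required
+in a real implementation, so this is an outline rather than a working type):
+
+.. code-block:: python
+
+    from pandas.api.extensions import (
+        ExtensionArray,
+        ExtensionDtype,
+        register_extension_dtype,
+    )
+
+    @register_extension_dtype  # makes the dtype usable as astype("my_dtype")
+    class MyDtype(ExtensionDtype):
+        name = "my_dtype"  # string identifier of the dtype
+        type = object      # scalar type of the array's elements
+
+        @classmethod
+        def construct_array_type(cls):
+            return MyArray
+
+    class MyArray(ExtensionArray):
+        # A real implementation must also provide _from_sequence, __getitem__,
+        # __len__, dtype, isna, take, copy and _concat_same_type, among others.
+        ...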
diff --git a/doc/source/development/maintaining.rst b/doc/source/development/maintaining.rst
index b8a39c0c91236..a38e6c13dea41 100644
--- a/doc/source/development/maintaining.rst
+++ b/doc/source/development/maintaining.rst
@@ -373,11 +373,14 @@ Prerequisites

 In order to be able to release a new pandas version, the following permissions are needed:

-- Merge rights to the `pandas `_,
-  `pandas-wheels `_, and
+- Merge rights to the `pandas `_ and
   `pandas-feedstock `_ repositories.
-- Permissions to push to main in the pandas repository, to push the new tags.
-- `Write permissions to PyPI `_
+  For the latter, open a PR adding your GitHub username to the conda-forge recipe.
+- Permissions to push to ``main`` in the pandas repository, to push the new tags.
+- `Write permissions to PyPI `_.
+- Access to our website / documentation server. Share your public key with the
+  infrastructure committee to be added to the ``authorized_keys`` file of the main
+  server user.
 - Access to the social media accounts, to publish the announcements.

 Pre-release
 ```````````
@@ -443,10 +446,10 @@ which will be triggered when the tag is pushed.

 4. Create a `new GitHub release `_:

-   - Title: ``Pandas ``
    - Tag: ````
-   - Files: ``pandas-.tar.gz`` source distribution just generated
+   - Title: ``Pandas ``
    - Description: Copy the description of the last release of the same kind
      (release candidate, major/minor or patch release)
+   - Files: ``pandas-.tar.gz`` source distribution just generated
    - Set as a pre-release: Only check for a release candidate
    - Set as the latest release: Leave checked, unless releasing a patch release
      for an older version (e.g. releasing 1.4.5 after 1.5 has been released)
@@ -454,6 +457,9 @@ which will be triggered when the tag is pushed.

 5. The GitHub release will, after some hours, trigger an
    `automated conda-forge PR `_.
    Merge it once the CI is green, and it will generate the conda-forge packages.
+   If a manual PR needs to be made, the version, sha256 and build fields are the
+   ones that usually need to be changed. If anything else in the recipe has changed since
+   the last release, those changes should be available in ``ci/meta.yaml``.

 6. Packages for supported versions in PyPI are built automatically from our CI.
    Once all packages are built, download all wheels from the
@@ -468,8 +474,16 @@ which will be triggered when the tag is pushed.

 Post-Release
 ````````````

-1. Update symlink to stable documentation by logging in to our web server, and
-   editing ``/var/www/html/pandas-docs/stable`` to point to ``version/``.
+1. Update symlinks to stable documentation by logging in to our web server, and
+   editing ``/var/www/html/pandas-docs/stable`` to point to ``version/``
+   for major and minor releases, or ``version/`` to ``version/`` for
+   patch releases. The exact instructions are (replace the example version numbers
+   with the appropriate ones for the version you are releasing):
+
+   - Log in to the server and use the correct user.
+   - ``cd /var/www/html/pandas-docs/``
+   - ``ln -sfn version/2.1 stable`` (for a major or minor release)
+   - ``ln -sfn version/2.0.3 version/2.0`` (for a patch release)

 2. If releasing a major or minor release, open a PR in our source code to update
    ``web/pandas/versions.json``, to have the desired versions in the documentation
@@ -481,13 +495,16 @@ Post-Release

 5. Open a PR with the placeholder for the release notes of the next version. See
    for example `the PR for 1.5.3 `_.
+   Note that the template to use depends on whether it is a major, minor or patch release.
6. Announce the new release in the official channels (use previous announcements
   for reference):

    - The pandas-dev and pydata mailing lists
-   - Twitter, Mastodon and Telegram
+   - Twitter, Mastodon, Telegram and LinkedIn
+
+7. Update these release instructions to fix anything incorrect and to incorporate any
+   changes since the last release.

 .. _governance documents: https://github.com/pandas-dev/pandas/blob/main/web/pandas/about/governance.md
 .. _list of permissions: https://docs.github.com/en/organizations/managing-access-to-your-organizations-repositories/repository-roles-for-an-organization
diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst
deleted file mode 100644
index 165be2a092535..0000000000000
--- a/doc/source/ecosystem.rst
+++ /dev/null
@@ -1,619 +0,0 @@
-:orphan:
-
-.. _ecosystem:
-
-{{ header }}
-
-****************
-pandas ecosystem
-****************
-
-Increasingly, packages are being built on top of pandas to address specific needs
-in data preparation, analysis and visualization.
-This is encouraging because it means pandas is not only helping users to handle
-their data tasks but also that it provides a better starting point for developers to
-build powerful and more focused data tools.
-The creation of libraries that complement pandas' functionality also allows pandas
-development to remain focused around its original requirements.
-
-This is an inexhaustive list of projects that build on pandas in order to provide
-tools in the PyData space. For a list of projects that depend on pandas,
-see the
-`GitHub network dependents for pandas `_
-or `search pypi for pandas `_.
-
-We'd like to make it easier for users to find these projects. If you know of other
-substantial projects that you feel should be on this list, please let us know.
-
-.. _ecosystem.data_cleaning_and_validation:
-
-Data cleaning and validation
-----------------------------
-
-`Pyjanitor `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pyjanitor provides a clean API for cleaning data, using method chaining.
-
-`Pandera `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pandera provides a flexible and expressive API for performing data validation on dataframes
-to make data processing pipelines more readable and robust.
-Dataframes contain information that pandera explicitly validates at runtime. This is useful in
-production-critical data pipelines or reproducible research settings.
-
-`pandas-path `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Since Python 3.4, `pathlib `_ has been
-included in the Python standard library. Path objects provide a simple
-and delightful way to interact with the file system. The pandas-path package enables the
-Path API for pandas through a custom accessor ``.path``. Getting just the filenames from
-a series of full file paths is as simple as ``my_files.path.name``. Other convenient operations like
-joining paths, replacing file extensions, and checking if files exist are also available.
-
-.. _ecosystem.stats:
-
-Statistics and machine learning
--------------------------------
-
-`pandas-tfrecords `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Easily save pandas DataFrames to the TensorFlow TFRecords format and read TFRecords back into pandas.
-
-`Statsmodels `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Statsmodels is the prominent Python "statistics and econometrics library" and it has
-a long-standing special relationship with pandas.
Statsmodels provides powerful statistics, -econometrics, analysis and modeling functionality that is out of pandas' scope. -Statsmodels leverages pandas objects as the underlying data container for computation. - -`sklearn-pandas `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Use pandas DataFrames in your `scikit-learn `__ -ML pipeline. - -`Featuretools `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Featuretools is a Python library for automated feature engineering built on top of pandas. It excels at transforming temporal and relational datasets into feature matrices for machine learning using reusable feature engineering "primitives". Users can contribute their own primitives in Python and share them with the rest of the community. - -`Compose `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Compose is a machine learning tool for labeling data and prediction engineering. It allows you to structure the labeling process by parameterizing prediction problems and transforming time-driven relational data into target values with cutoff times that can be used for supervised learning. - -`STUMPY `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -STUMPY is a powerful and scalable Python library for modern time series analysis. -At its core, STUMPY efficiently computes something called a -`matrix profile `__, -which can be used for a wide variety of time series data mining tasks. - -.. _ecosystem.visualization: - -Visualization -------------- - -`Pandas has its own Styler class for table visualization `_, and while -:ref:`pandas also has built-in support for data visualization through charts with matplotlib `, -there are a number of other pandas-compatible libraries. - -`Altair `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Altair is a declarative statistical visualization library for Python. -With Altair, you can spend more time understanding your data and its -meaning. Altair's API is simple, friendly and consistent and built on -top of the powerful Vega-Lite JSON specification. This elegant -simplicity produces beautiful and effective visualizations with a -minimal amount of code. Altair works with pandas DataFrames. - - -`Bokeh `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Bokeh is a Python interactive visualization library for large datasets that natively uses -the latest web technologies. Its goal is to provide elegant, concise construction of novel -graphics in the style of Protovis/D3, while delivering high-performance interactivity over -large data to thin clients. - -`Pandas-Bokeh `__ provides a high level API -for Bokeh that can be loaded as a native pandas plotting backend via - -.. code:: python - - pd.set_option("plotting.backend", "pandas_bokeh") - -It is very similar to the matplotlib plotting backend, but provides interactive -web-based charts and maps. - - -`Seaborn `__ -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Seaborn is a Python visualization library based on -`matplotlib `__. It provides a high-level, dataset-oriented -interface for creating attractive statistical graphics. The plotting functions -in seaborn understand pandas objects and leverage pandas grouping operations -internally to support concise specification of complex visualizations. Seaborn -also goes beyond matplotlib and pandas with the option to perform statistical -estimation while plotting, aggregating across observations and visualizing the -fit of statistical models to emphasize patterns in a dataset. 
-
-`plotnine `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Hadley Wickham's `ggplot2 `__ is a foundational exploratory visualization package for the R language.
-Based on `"The Grammar of Graphics" `__, it
-provides a powerful, declarative and extremely general way to generate bespoke plots of any kind of data.
-Various implementations in other languages are available.
-A good implementation for Python users is `has2k1/plotnine `__.
-
-`IPython vega `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`IPython Vega `__ leverages `Vega
-`__ to create plots within Jupyter Notebook.
-
-`Plotly `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`Plotly's `__ `Python API `__ enables interactive figures and web shareability. Maps, 2D, 3D, and live-streaming graphs are rendered with WebGL and `D3.js `__. The library supports plotting directly from a pandas DataFrame and cloud-based collaboration. Users of `matplotlib, ggplot for Python, and Seaborn `__ can convert figures into interactive web-based plots. Plots can be drawn in `IPython Notebooks `__, edited with R or MATLAB, modified in a GUI, or embedded in apps and dashboards. Plotly is free for unlimited sharing, and has `offline `__ or `on-premise `__ accounts for private use.
-
-`Lux `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`Lux `__ is a Python library that facilitates fast and easy experimentation with data by automating the visual data exploration process. To use Lux, simply add an extra import alongside pandas:
-
-.. code:: python
-
-    import lux
-    import pandas as pd
-
-    df = pd.read_csv("data.csv")
-    df  # discover interesting insights!
-
-By printing out a dataframe, Lux automatically `recommends a set of visualizations `__ that highlights interesting trends and patterns in the dataframe. Users can leverage any existing pandas commands without modifying their code, while being able to visualize their pandas data structures (e.g., DataFrame, Series, Index) at the same time. Lux also offers a `powerful, intuitive language `__ that allows users to create `Altair `__, `matplotlib `__, or `Vega-Lite `__ visualizations without having to think at the level of code.
-
-`Qtpandas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Spun off from the main pandas library, the `qtpandas `__
-library enables DataFrame visualization and manipulation in PyQt4 and PySide applications.
-
-`D-Tale `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-D-Tale is a lightweight web client for visualizing pandas data structures. It
-provides a rich spreadsheet-style grid which acts as a wrapper for a lot of
-pandas functionality (query, sort, describe, corr...) so users can quickly
-manipulate their data. There is also an interactive chart-builder using Plotly
-Dash allowing users to build nice portable visualizations. D-Tale can be
-invoked with the following command:
-
-.. code:: python
-
-    import dtale
-
-    dtale.show(df)
-
-D-Tale integrates seamlessly with Jupyter notebooks, Python terminals, Kaggle
-& Google Colab. Here are some demos of the `grid `__.
-
-`hvplot `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-hvPlot is a high-level plotting API for the PyData ecosystem built on `HoloViews `__.
-It can be loaded as a native pandas plotting backend via
-
-.. code:: python
-
-    pd.set_option("plotting.backend", "hvplot")
-
-.. _ecosystem.ide:
-
-IDE
----
-
-`IPython `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-IPython is an interactive command shell and distributed computing
-environment.
-IPython tab completion works with pandas methods and also
-attributes like DataFrame columns.
-
-`Jupyter Notebook / Jupyter Lab `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-Jupyter Notebook is a web application for creating Jupyter notebooks.
-A Jupyter notebook is a JSON document containing an ordered list
-of input/output cells which can contain code, text, mathematics, plots
-and rich media.
-Jupyter notebooks can be converted to a number of open standard output formats
-(HTML, HTML presentation slides, LaTeX, PDF, ReStructuredText, Markdown,
-Python) through 'Download As' in the web interface and ``jupyter nbconvert``
-in a shell.
-
-pandas DataFrames implement ``_repr_html_`` and ``_repr_latex_`` methods
-which are utilized by Jupyter Notebook for displaying
-(abbreviated) HTML or LaTeX tables. LaTeX output is properly escaped.
-(Note: HTML tables may or may not be
-compatible with non-HTML Jupyter output formats.)
-
-See :ref:`Options and Settings ` and
-:ref:`Available Options `
-for pandas ``display.`` settings.
-
-`Quantopian/qgrid `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-qgrid is "an interactive grid for sorting and filtering
-DataFrames in IPython Notebook" built with SlickGrid.
-
-`Spyder `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Spyder is a cross-platform PyQt-based IDE combining the editing, analysis,
-debugging and profiling functionality of a software development tool with the
-data exploration, interactive execution, deep inspection and rich visualization
-capabilities of a scientific environment like MATLAB or RStudio.
-
-Its `Variable Explorer `__
-allows users to view, manipulate and edit pandas ``Index``, ``Series``,
-and ``DataFrame`` objects like a "spreadsheet", including copying and modifying
-values, sorting, displaying a "heatmap", converting data types and more.
-pandas objects can also be renamed, duplicated, new columns added,
-copied/pasted to/from the clipboard (as TSV), and saved/loaded to/from a file.
-Spyder can also import data from a variety of plain text and binary files
-or the clipboard into a new pandas DataFrame via a sophisticated import wizard.
-
-Most pandas classes, methods and data attributes can be autocompleted in
-Spyder's `Editor `__ and
-`IPython Console `__,
-and Spyder's `Help pane `__ can retrieve
-and render Numpydoc documentation on pandas objects in rich text with Sphinx
-both automatically and on-demand.
-
-
-.. _ecosystem.api:
-
-API
----
-
-`pandas-datareader `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-``pandas-datareader`` is a remote data access library for pandas (PyPI: ``pandas-datareader``).
-It is based on functionality that was located in ``pandas.io.data`` and ``pandas.io.wb`` but was
-split off in v0.19.
-See more in the `pandas-datareader docs `_.
-
-The following data feeds are available:
-
- * Google Finance
- * Tiingo
- * Morningstar
- * IEX
- * Robinhood
- * Enigma
- * Quandl
- * FRED
- * Fama/French
- * World Bank
- * OECD
- * Eurostat
- * TSP Fund Data
- * Nasdaq Trader Symbol Definitions
- * Stooq Index Data
- * MOEX Data
-
-`Quandl/Python `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The Quandl API for Python wraps the Quandl REST API to return
-pandas DataFrames with timeseries indexes.
-
-`Pydatastream `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-PyDatastream is a Python interface to the
-`Refinitiv Datastream (DWS) `__
-REST API to return indexed pandas DataFrames with financial data.
-This package requires valid credentials for this API (which are not free).
-
-`pandaSDMX `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-pandaSDMX is a library to retrieve and acquire statistical data
-and metadata disseminated in
-`SDMX `_ 2.1, an ISO standard
-widely used by institutions such as statistics offices, central banks,
-and international organisations. pandaSDMX can expose datasets and related
-structural metadata including data flows, code-lists,
-and data structure definitions as pandas Series
-or MultiIndexed DataFrames.
-
-`fredapi `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-fredapi is a Python interface to the `Federal Reserve Economic Data (FRED) `__
-provided by the Federal Reserve Bank of St. Louis. It works with both the FRED database and the ALFRED database, which
-contains point-in-time data (i.e. historic data revisions). fredapi provides a wrapper in Python to the FRED
-HTTP API, and also provides several convenient methods for parsing and analyzing point-in-time data from ALFRED.
-fredapi makes use of pandas and returns data in a Series or DataFrame. This module requires a FRED API key that
-you can obtain for free on the FRED website.
-
-`dataframe_sql `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-``dataframe_sql`` is a Python package that translates SQL syntax directly into
-operations on pandas DataFrames. This is useful when migrating from a database to
-using pandas, or for users more comfortable with SQL looking for a way to interface
-with pandas.
-
-
-.. _ecosystem.domain:
-
-Domain specific
----------------
-
-`Geopandas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Geopandas extends pandas data objects to include geographic information which supports
-geometric operations. If your work entails maps and geographical coordinates, and
-you love pandas, you should take a close look at Geopandas.
-
-`staircase `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-staircase is a data analysis package, built upon pandas and numpy, for modelling and
-manipulation of mathematical step functions. It provides a rich variety of arithmetic
-operations, relational operations, logical operations, statistical operations and
-aggregations for step functions defined over real numbers, datetime and timedelta domains.
-
-
-`xarray `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-xarray brings the labeled data power of pandas to the physical sciences by
-providing N-dimensional variants of the core pandas data structures. It aims to
-provide a pandas-like and pandas-compatible toolkit for analytics on
-multi-dimensional arrays, rather than the tabular data at which pandas excels.
-
-
-.. _ecosystem.io:
-
-IO
---
-
-`BCPandas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-BCPandas provides high-performance writes from pandas to Microsoft SQL Server,
-far exceeding the performance of the native ``df.to_sql`` method. Internally, it uses
-Microsoft's BCP utility, but the complexity is fully abstracted away from the end user.
-Rigorously tested, it is a complete replacement for ``df.to_sql``.
-
-`Deltalake `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The Deltalake Python package lets you access tables stored in
-`Delta Lake `__ natively in Python without the need to use Spark or the
-JVM. It provides the ``delta_table.to_pyarrow_table().to_pandas()`` method to convert
-any Delta table into a pandas DataFrame.
-
-.. _ecosystem.out-of-core:
-
-Out-of-core
------------
-
-`Blaze `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Blaze provides a standard API for doing computations with various
-in-memory and on-disk backends: NumPy, pandas, SQLAlchemy, MongoDB, PyTables,
-PySpark.
-
-`Cylon `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Cylon is a fast, scalable, distributed memory parallel runtime with a pandas-like
-Python DataFrame API. "Core Cylon" is implemented in C++ using the Apache
-Arrow format to represent the data in-memory. The Cylon DataFrame API implements
-most of the core operators of pandas such as merge, filter, join, concat,
-group-by, drop_duplicates, etc. These operators are designed to work across
-thousands of cores to scale applications. It can interoperate with pandas
-DataFrame by reading data from pandas or converting data to pandas so users
-can selectively scale parts of their pandas DataFrame applications.
-
-.. code:: python
-
-    from pycylon import read_csv, DataFrame, CylonEnv
-    from pycylon.net import MPIConfig
-
-    # Initialize Cylon distributed environment
-    config: MPIConfig = MPIConfig()
-    env: CylonEnv = CylonEnv(config=config, distributed=True)
-
-    df1: DataFrame = read_csv('/tmp/csv1.csv')
-    df2: DataFrame = read_csv('/tmp/csv2.csv')
-
-    # Using 1000s of cores across the cluster to compute the join
-    df3: DataFrame = df1.join(other=df2, on=[0], algorithm="hash", env=env)
-
-    print(df3)
-
-`Dask `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Dask is a flexible parallel computing library for analytics. Dask
-provides a familiar ``DataFrame`` interface for out-of-core, parallel and distributed computing.
-
-`Dask-ML `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Dask-ML enables parallel and distributed machine learning using Dask alongside existing machine learning libraries like Scikit-Learn, XGBoost, and TensorFlow.
-
-`Ibis `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Ibis offers a standard way to write analytics code that can be run in multiple engines. It helps in bridging the gap between local Python environments (like pandas) and remote storage and execution systems like Hadoop components (like HDFS, Impala, Hive, Spark) and SQL databases (Postgres, etc.).
-
-
-`Koalas `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Koalas provides a familiar pandas DataFrame interface on top of Apache Spark. It enables users to leverage multiple cores on one machine or a cluster of machines to speed up or scale their DataFrame code.
-
-`Modin `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The ``modin.pandas`` DataFrame is a parallel and distributed drop-in replacement
-for pandas. This means that you can use Modin with existing pandas code or write
-new code with the existing pandas API. Modin can leverage your entire machine or
-cluster to speed up and scale your pandas workloads, including traditionally
-time-consuming tasks like ingesting data (``read_csv``, ``read_excel``,
-``read_parquet``, etc.).
-
-.. code:: python
-
-    # import pandas as pd
-    import modin.pandas as pd
-
-    df = pd.read_csv("big.csv")  # use all your cores!
-
-`Odo `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Odo provides a uniform API for moving data between different formats. It uses
-pandas' own ``read_csv`` for CSV IO and leverages many existing packages such as
-PyTables, h5py, and pymongo to move data between non-pandas formats. Its
-graph-based approach is also extensible by end users for custom formats that may be
-too specific for the core of odo.
-
-`Pandarallel `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pandarallel provides a simple way to parallelize your pandas operations on all your CPUs by changing only one line of code.
-It also displays progress bars.
-
-.. code:: python
-
-    from pandarallel import pandarallel
-
-    pandarallel.initialize(progress_bar=True)
-
-    # df.apply(func)
-    df.parallel_apply(func)
-
-
-`Vaex `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Vaex is a Python library for out-of-core DataFrames (similar to pandas), to visualize and explore big tabular datasets. It can calculate statistics such as mean, sum, count, standard deviation, etc., on an N-dimensional grid at up to a billion (10\ :sup:`9`) objects/rows per second. Visualization is done using histograms, density plots and 3d volume rendering, allowing interactive exploration of big data. Vaex uses memory mapping, a zero memory copy policy and lazy computations for best performance (no memory wasted).
-
- * vaex.from_pandas
- * vaex.to_pandas_df
-
-.. _ecosystem.extensions:
-
-Extension data types
---------------------
-
-pandas provides an interface for defining
-:ref:`extension types ` to extend NumPy's type
-system. The following libraries implement that interface to provide types not
-found in NumPy or pandas, which work well with pandas' data containers.
-
-`Cyberpandas`_
-~~~~~~~~~~~~~~
-
-Cyberpandas provides an extension type for storing arrays of IP addresses. These
-arrays can be stored inside pandas' Series and DataFrame.
-
-`Pandas-Genomics`_
-~~~~~~~~~~~~~~~~~~
-
-Pandas-Genomics provides extension types, extension arrays, and extension accessors for working with genomics data.
-
-`Pint-Pandas`_
-~~~~~~~~~~~~~~
-
-`Pint-Pandas `_ provides an extension type for
-storing numeric arrays with units. These arrays can be stored inside pandas'
-Series and DataFrame. Operations between Series and DataFrame columns which
-use pint's extension array are then units aware.
-
-`Text Extensions for Pandas`_
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-`Text Extensions for Pandas `_
-provides extension types to cover common data structures for representing natural language
-data, plus library integrations that convert the outputs of popular natural language
-processing libraries into Pandas DataFrames.
-
-.. _ecosystem.accessors:
-
-Accessors
----------
-
-A directory of projects providing
-:ref:`extension accessors `. This is for users to
-discover new accessors and for library authors to coordinate on the namespace.
-
-================== ============ ==================================== ===============================================================================
-Library            Accessor     Classes                              Description
-================== ============ ==================================== ===============================================================================
-`cyberpandas`_     ``ip``       ``Series``                           Provides common operations for working with IP addresses.
-`pdvega`_          ``vgplot``   ``Series``, ``DataFrame``            Provides plotting functions from the Altair_ library.
-`pandas-genomics`_ ``genomics`` ``Series``, ``DataFrame``            Provides common operations for quality control and analysis of genomics data.
-`pandas_path`_     ``path``     ``Index``, ``Series``                Provides `pathlib.Path`_ functions for Series.
-`pint-pandas`_     ``pint``     ``Series``, ``DataFrame``            Provides units support for numeric Series and DataFrames.
-`composeml`_       ``slice``    ``DataFrame``                        Provides a generator for enhanced data slicing.
-`datatest`_        ``validate`` ``Series``, ``DataFrame``, ``Index`` Provides validation, differences, and acceptance managers.
-`woodwork`_        ``ww``       ``Series``, ``DataFrame``            Provides physical, logical, and semantic data typing information for Series and DataFrames.
-`staircase`_       ``sc``       ``Series``                           Provides methods for querying, aggregating and plotting step functions.
-================== ============ ==================================== ===============================================================================
-
-.. _cyberpandas: https://cyberpandas.readthedocs.io/en/latest
-.. _pdvega: https://altair-viz.github.io/pdvega/
-.. _Altair: https://altair-viz.github.io/
-.. _pandas-genomics: https://pandas-genomics.readthedocs.io/en/latest/
-.. _pandas_path: https://github.com/drivendataorg/pandas-path/
-.. _pathlib.Path: https://docs.python.org/3/library/pathlib.html
-.. _pint-pandas: https://github.com/hgrecco/pint-pandas
-.. _composeml: https://github.com/alteryx/compose
-.. _datatest: https://datatest.readthedocs.io/en/stable/
-.. _woodwork: https://github.com/alteryx/woodwork
-.. _staircase: https://www.staircase.dev/
-
-Development tools
------------------
-
-`pandas-stubs `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-While the pandas repository is partially typed, the package itself doesn't expose this information for external use.
-Install pandas-stubs to enable basic type coverage of the pandas API.
-
-Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`.
-
-See installation and usage instructions on the `GitHub page `__.
-
-`Hamilton `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Hamilton is a declarative dataflow framework that came out of Stitch Fix. It was designed to help one manage a pandas code base, specifically with respect to feature engineering for machine learning models.
-
-It prescribes an opinionated paradigm that ensures all code is:
-
-* unit testable
-* integration testing friendly
-* documentation friendly
-* reusable, as transformation logic is decoupled from the context in which it is used
-* integratable with runtime data quality checks
-
-This helps you scale your pandas code base while keeping maintenance costs low.
-
-For more information, see the `documentation `__.
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index 767779b0f58a8..25ba237e8caf3 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -246,7 +246,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this:
         }
     )

-    baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max)
+    baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")

 For more details and examples see :ref:`the reshaping documentation
 `.
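+
+To see that the string alias behaves like the NumPy function it replaces, here is a
+quick sketch reusing the ``baseball`` frame constructed above; the per-team maxima
+should match the pivot table:
+
+.. ipython:: python
+
+   baseball.groupby("team")["batting avg"].agg("max")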
-
-Development tools
------------------
-
-`pandas-stubs `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-While the pandas repository is partially typed, the package itself doesn't expose this information for external use.
-Install pandas-stubs to enable basic type coverage of the pandas API.
-
-Learn more by reading through :issue:`14468`, :issue:`26766`, :issue:`28142`.
-
-See installation and usage instructions on the `GitHub page `__.
-
-`Hamilton `__
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Hamilton is a declarative dataflow framework that came out of Stitch Fix. It was designed to help manage a pandas code base, specifically with respect to feature engineering for machine learning models.
-
-It prescribes an opinionated paradigm that ensures all code is:
-
-* unit testable
-* integration-testing friendly
-* documentation friendly
-* reusable, since transformation logic is decoupled from the context in which it is used
-* integratable with runtime data quality checks
-
-This helps you scale a pandas code base while keeping maintenance costs low.
-
-For more information, see the `documentation `__.
diff --git a/doc/source/getting_started/comparison/comparison_with_r.rst b/doc/source/getting_started/comparison/comparison_with_r.rst
index 767779b0f58a8..25ba237e8caf3 100644
--- a/doc/source/getting_started/comparison/comparison_with_r.rst
+++ b/doc/source/getting_started/comparison/comparison_with_r.rst
@@ -246,7 +246,7 @@ In pandas we may use :meth:`~pandas.pivot_table` method to handle this:
         }
     )
 
-    baseball.pivot_table(values="batting avg", columns="team", aggfunc=np.max)
+    baseball.pivot_table(values="batting avg", columns="team", aggfunc="max")
 
 For more details and examples see :ref:`the reshaping documentation
 `.
@@ -359,7 +359,7 @@ In pandas the equivalent expression, using the
     )
 
     grouped = df.groupby(["month", "week"])
-    grouped["x"].agg([np.mean, np.std])
+    grouped["x"].agg(["mean", "std"])
 
 For more details and examples see :ref:`the groupby documentation
@@ -482,7 +482,7 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`:
         values="value",
         index=["variable", "week"],
         columns=["month"],
-        aggfunc=np.mean,
+        aggfunc="mean",
     )
 
 Similarly for ``dcast`` which uses a data.frame called ``df`` in R to
diff --git a/doc/source/getting_started/comparison/comparison_with_sql.rst b/doc/source/getting_started/comparison/comparison_with_sql.rst
index a6d9d65e85645..7a83d50416186 100644
--- a/doc/source/getting_started/comparison/comparison_with_sql.rst
+++ b/doc/source/getting_started/comparison/comparison_with_sql.rst
@@ -198,7 +198,7 @@ to your grouped DataFrame, indicating which functions to apply to specific columns
 .. ipython:: python
 
-    tips.groupby("day").agg({"tip": np.mean, "day": np.size})
+    tips.groupby("day").agg({"tip": "mean", "day": "size"})
 
 Grouping by more than one column is done by passing a list of columns to the
 :meth:`~pandas.DataFrame.groupby` method.
@@ -222,7 +222,7 @@ Grouping by more than one column is done by passing a list of columns to the
 .. ipython:: python
 
-    tips.groupby(["smoker", "day"]).agg({"tip": [np.size, np.mean]})
+    tips.groupby(["smoker", "day"]).agg({"tip": ["size", "mean"]})
 
 .. _compare_with_sql.join:
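The recurring change in the hunks above is the same edit: NumPy callables passed as aggregation functions are replaced by pandas' string aliases. A quick sketch of the resulting pattern on a throwaway frame (the column names are made up):

.. code:: python

   import pandas as pd

   df = pd.DataFrame({"team": ["a", "a", "b"], "score": [1.0, 2.0, 4.0]})

   # String aliases resolve to pandas' own implementations and avoid
   # passing np.mean / np.max / np.size callables directly.
   df.groupby("team")["score"].agg(["mean", "std"])
   df.pivot_table(values="score", columns="team", aggfunc="max")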
diff --git a/doc/source/getting_started/comparison/includes/missing.rst b/doc/source/getting_started/comparison/includes/missing.rst
index 341c7d5498d82..ab5d90166e7b0 100644
--- a/doc/source/getting_started/comparison/includes/missing.rst
+++ b/doc/source/getting_started/comparison/includes/missing.rst
@@ -19,7 +19,7 @@ Forward fill from previous rows
 .. ipython:: python
 
-    outer_join.fillna(method="ffill")
+    outer_join.ffill()
 
 Replace missing values with a specified value
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
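A small sketch of the replacement spelling on a made-up frame; ``ffill`` propagates the last valid observation forward, just like the old ``fillna(method="ffill")`` form:

.. code:: python

   import numpy as np
   import pandas as pd

   df = pd.DataFrame({"value": [1.0, np.nan, np.nan, 4.0]})

   # Equivalent to the removed df.fillna(method="ffill") spelling.
   df.ffill()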
diff --git a/doc/source/getting_started/index.rst b/doc/source/getting_started/index.rst
index d344c0750b73d..d9cb1de14aded 100644
--- a/doc/source/getting_started/index.rst
+++ b/doc/source/getting_started/index.rst
@@ -9,48 +9,53 @@ Getting started
 Installation
 ------------
 
-.. panels::
-    :card: + install-card
-    :column: col-lg-6 col-md-6 col-sm-12 col-xs-12 p-3
+.. grid:: 1 2 2 2
+    :gutter: 4
 
-    Working with conda?
-    ^^^^^^^^^^^^^^^^^^^
+    .. grid-item-card:: Working with conda?
+        :class-card: install-card
+        :columns: 12 12 6 6
+        :padding: 3
 
-    pandas is part of the `Anaconda `__
-    distribution and can be installed with Anaconda or Miniconda:
+        pandas is part of the `Anaconda `__
+        distribution and can be installed with Anaconda or Miniconda:
 
-    ++++++++++++++++++++++
+        ++++++++++++++++++++++
 
-    .. code-block:: bash
+        .. code-block:: bash
 
-        conda install pandas
+            conda install -c conda-forge pandas
 
-    ---
+    .. grid-item-card:: Prefer pip?
+        :class-card: install-card
+        :columns: 12 12 6 6
+        :padding: 3
 
-    Prefer pip?
-    ^^^^^^^^^^^
+        pandas can be installed via pip from `PyPI `__.
 
-    pandas can be installed via pip from `PyPI `__.
+        ++++
 
-    ++++
+        .. code-block:: bash
 
-    .. code-block:: bash
+            pip install pandas
 
-        pip install pandas
+    .. grid-item-card:: In-depth instructions?
+        :class-card: install-card
+        :columns: 12
+        :padding: 3
 
-    ---
-    :column: col-12 p-3
+        Installing a specific version? Installing from source? Check the advanced
+        installation page.
 
-    In-depth instructions?
-    ^^^^^^^^^^^^^^^^^^^^^^
+        +++
 
-    Installing a specific version? Installing from source? Check the advanced
-    installation page.
+        .. button-ref:: install
+            :ref-type: ref
+            :click-parent:
+            :color: secondary
+            :expand:
 
-    .. link-button:: ./install.html
-        :type: url
-        :text: Learn more
-        :classes: btn-secondary stretched-link
+        Learn more
 
 .. _gentle_intro:
@@ -64,7 +69,7 @@ Intro to pandas
-