From 87d6e86275a251426354462187f5addf785b489d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 25 Mar 2023 14:47:19 -0700 Subject: [PATCH 01/10] CI: Test pyarrow nightly instead of intermediate versions --- .github/actions/setup-conda/action.yml | 11 --------- .github/workflows/macos-windows.yml | 1 - .github/workflows/ubuntu.yml | 22 ++++-------------- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 28 +++++++++++++++++++++++ ci/deps/actions-311.yaml | 2 +- ci/deps/actions-38-downstream_compat.yaml | 2 +- ci/deps/actions-38.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/circle-38-arm64.yaml | 2 +- scripts/validate_min_versions_in_sync.py | 2 +- 11 files changed, 40 insertions(+), 36 deletions(-) create mode 100644 ci/deps/actions-311-pyarrownightly.yaml diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index efc31bba88f28..329dc24d466b4 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -9,20 +9,9 @@ inputs: extra-specs: description: Extra packages to install required: false - pyarrow-version: - description: If set, overrides the PyArrow version in the Conda environment to the given string. 
- required: false runs: using: composite steps: - - name: Set Arrow version in ${{ inputs.environment-file }} to ${{ inputs.pyarrow-version }} - run: | - grep -q ' - pyarrow' ${{ inputs.environment-file }} - sed -i"" -e "s/ - pyarrow/ - pyarrow=${{ inputs.pyarrow-version }}/" ${{ inputs.environment-file }} - cat ${{ inputs.environment-file }} - shell: bash - if: ${{ inputs.pyarrow-version }} - - name: Install ${{ inputs.environment-file }} uses: mamba-org/provision-with-micromamba@v12 with: diff --git a/.github/workflows/macos-windows.yml b/.github/workflows/macos-windows.yml index 15308d0c086f6..7ed5f5b90b959 100644 --- a/.github/workflows/macos-windows.yml +++ b/.github/workflows/macos-windows.yml @@ -52,7 +52,6 @@ jobs: uses: ./.github/actions/setup-conda with: environment-file: ci/deps/${{ matrix.env_file }} - pyarrow-version: ${{ matrix.os == 'macos-latest' && '9' || '' }} - name: Build Pandas uses: ./.github/actions/build_pandas diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index f990c6edaeb2a..a2ee2d24c7030 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -27,7 +27,6 @@ jobs: matrix: env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml, actions-311.yaml] pattern: ["not single_cpu", "single_cpu"] - pyarrow_version: ["8", "9", "10"] include: - name: "Downstream Compat" env_file: actions-38-downstream_compat.yaml @@ -75,21 +74,11 @@ jobs: # TODO(cython3): Re-enable once next-beta(after beta 1) comes out # There are some warnings failing the build with -werror pandas_ci: "0" - exclude: - - env_file: actions-38.yaml - pyarrow_version: "8" - - env_file: actions-38.yaml - pyarrow_version: "9" - - env_file: actions-39.yaml - pyarrow_version: "8" - - env_file: actions-39.yaml - pyarrow_version: "9" - - env_file: actions-310.yaml - pyarrow_version: "8" - - env_file: actions-310.yaml - pyarrow_version: "9" + - name: "Pyarrow Nightly" + env_file: actions-311-pyarrownightly.yaml + pattern: "not slow and 
not network and not single_cpu" fail-fast: false - name: ${{ matrix.name || format('{0} pyarrow={1} {2}', matrix.env_file, matrix.pyarrow_version, matrix.pattern) }} + name: ${{ matrix.name || format('{0} {2}', matrix.env_file, matrix.pattern) }} env: ENV_FILE: ci/deps/${{ matrix.env_file }} PATTERN: ${{ matrix.pattern }} @@ -107,7 +96,7 @@ jobs: COVERAGE: ${{ !contains(matrix.env_file, 'pypy') }} concurrency: # https://github.community/t/concurrecy-not-work-for-push/183068/7 - group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.pyarrow_version || '' }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} + group: ${{ github.event_name == 'push' && github.run_number || github.ref }}-${{ matrix.env_file }}-${{ matrix.pattern }}-${{ matrix.extra_apt || '' }}-${{ matrix.pandas_data_manager || '' }} cancel-in-progress: true services: @@ -166,7 +155,6 @@ jobs: uses: ./.github/actions/setup-conda with: environment-file: ${{ env.ENV_FILE }} - pyarrow-version: ${{ matrix.pyarrow_version }} - name: Build Pandas uses: ./.github/actions/build_pandas diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index f40b555593f6b..47405b72476fd 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -41,7 +41,7 @@ dependencies: - psycopg2>=2.8.6 - pymysql>=1.0.2 - pytables>=3.6.1 - - pyarrow + - pyarrow>=7.0.0 - pyreadstat>=1.1.2 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml new file mode 100644 index 0000000000000..494fca0c6c4cd --- /dev/null +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -0,0 +1,28 @@ +name: pandas-dev +channels: + - arrow-nightlies + - conda-forge +dependencies: + - python=3.11 + + # build dependencies + - versioneer[toml] + - cython>=0.29.33 + + # test dependencies + - pytest>=7.0.0 + - pytest-cov + - pytest-xdist>=2.2.0 + - pytest-asyncio>=0.17.0 + - 
boto3 + + # required dependencies + - python-dateutil + - numpy + - pytz + + # optional dependencies + - pyarrow + + - pip: + - tzdata>=2022.1 diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index fa08bdf438dff..9ebfb710e0abb 100644 --- a/ci/deps/actions-311.yaml +++ b/ci/deps/actions-311.yaml @@ -41,7 +41,7 @@ dependencies: - psycopg2>=2.8.6 - pymysql>=1.0.2 # - pytables>=3.8.0 # first version that supports 3.11 - - pyarrow + - pyarrow>=7.0.0 - pyreadstat>=1.1.2 - python-snappy>=0.6.0 - pyxlsb>=1.0.8 diff --git a/ci/deps/actions-38-downstream_compat.yaml b/ci/deps/actions-38-downstream_compat.yaml index a9265bd84ee87..3ed2786b76896 100644 --- a/ci/deps/actions-38-downstream_compat.yaml +++ b/ci/deps/actions-38-downstream_compat.yaml @@ -39,7 +39,7 @@ dependencies: - openpyxl<3.1.1, >=3.0.7 - odfpy>=1.4.1 - psycopg2>=2.8.6 - - pyarrow + - pyarrow>=7.0.0 - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 diff --git a/ci/deps/actions-38.yaml b/ci/deps/actions-38.yaml index 27872514447a5..4060a837d1757 100644 --- a/ci/deps/actions-38.yaml +++ b/ci/deps/actions-38.yaml @@ -39,7 +39,7 @@ dependencies: - odfpy>=1.4.1 - pandas-gbq>=0.15.0 - psycopg2>=2.8.6 - - pyarrow + - pyarrow>=7.0.0 - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 4b0575d8a3afd..53cd9c5635493 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -40,7 +40,7 @@ dependencies: - pandas-gbq>=0.15.0 - psycopg2>=2.8.6 - pymysql>=1.0.2 - - pyarrow + - pyarrow>=7.0.0 - pyreadstat>=1.1.2 - pytables>=3.6.1 - python-snappy>=0.6.0 diff --git a/ci/deps/circle-38-arm64.yaml b/ci/deps/circle-38-arm64.yaml index c3d89e735ae37..2e4070fa82010 100644 --- a/ci/deps/circle-38-arm64.yaml +++ b/ci/deps/circle-38-arm64.yaml @@ -39,7 +39,7 @@ dependencies: - odfpy>=1.4.1 - pandas-gbq>=0.15.0 - psycopg2>=2.8.6 - - pyarrow + - pyarrow>=7.0.0 - pymysql>=1.0.2 # Not provided on ARM #- pyreadstat diff --git 
a/scripts/validate_min_versions_in_sync.py b/scripts/validate_min_versions_in_sync.py index b6016a35e3dbb..e0182ebaaee60 100755 --- a/scripts/validate_min_versions_in_sync.py +++ b/scripts/validate_min_versions_in_sync.py @@ -37,7 +37,7 @@ YAML_PATH = pathlib.Path("ci/deps") ENV_PATH = pathlib.Path("environment.yml") EXCLUDE_DEPS = {"tzdata", "blosc"} -EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]", "pyarrow"]) +EXCLUSION_LIST = frozenset(["python=3.8[build=*_pypy]"]) # pandas package is not available # in pre-commit environment sys.path.append("pandas/compat") From 7703e6aa66342fc3c38b756e43d8a27e98b5bfa7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Sat, 25 Mar 2023 14:48:13 -0700 Subject: [PATCH 02/10] Change format --- .github/workflows/ubuntu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index a2ee2d24c7030..441aa98cf7aa0 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -78,7 +78,7 @@ jobs: env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" fail-fast: false - name: ${{ matrix.name || format('{0} {2}', matrix.env_file, matrix.pattern) }} + name: ${{ matrix.name || format('{0} {1}', matrix.env_file, matrix.pattern) }} env: ENV_FILE: ci/deps/${{ matrix.env_file }} PATTERN: ${{ matrix.pattern }} From 9b8cdb5065dce288befb8083c2938319cc979617 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Mar 2023 15:35:24 -0700 Subject: [PATCH 03/10] Pin, remove hardcoded channel --- .github/actions/setup-conda/action.yml | 1 - ci/deps/actions-311-pyarrownightly.yaml | 1 + environment.yml | 2 +- requirements-dev.txt | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 329dc24d466b4..85261a3931c45 100644 
--- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -18,7 +18,6 @@ runs: environment-file: ${{ inputs.environment-file }} environment-name: ${{ inputs.environment-name }} extra-specs: ${{ inputs.extra-specs }} - channels: conda-forge channel-priority: 'strict' condarc-file: ci/condarc.yml cache-env: true diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 494fca0c6c4cd..10bdfa27fac31 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -13,6 +13,7 @@ dependencies: - pytest>=7.0.0 - pytest-cov - pytest-xdist>=2.2.0 + - hypothesis>=6.34.2 - pytest-asyncio>=0.17.0 - boto3 diff --git a/environment.yml b/environment.yml index f29ade1dc5173..5aa1fad2e51c7 100644 --- a/environment.yml +++ b/environment.yml @@ -42,7 +42,7 @@ dependencies: - odfpy>=1.4.1 - py - psycopg2>=2.8.6 - - pyarrow + - pyarrow>=7.0.0 - pymysql>=1.0.2 - pyreadstat>=1.1.2 - pytables>=3.6.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index 9c0bdc64d6e07..f3c9649a5a707 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -31,7 +31,7 @@ openpyxl<3.1.1, >=3.0.7 odfpy>=1.4.1 py psycopg2-binary>=2.8.6 -pyarrow +pyarrow>=7.0.0 pymysql>=1.0.2 pyreadstat>=1.1.2 tables>=3.6.1 From d195e1c1c0d77176ac6085527b25bf9b04bc7ca8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Mar 2023 15:54:41 -0700 Subject: [PATCH 04/10] Try pip --- .github/actions/setup-conda/action.yml | 1 + ci/deps/actions-311-pyarrownightly.yaml | 11 ++++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/actions/setup-conda/action.yml b/.github/actions/setup-conda/action.yml index 85261a3931c45..329dc24d466b4 100644 --- a/.github/actions/setup-conda/action.yml +++ b/.github/actions/setup-conda/action.yml @@ -18,6 +18,7 @@ runs: environment-file: ${{ inputs.environment-file }} environment-name: ${{ 
inputs.environment-name }} extra-specs: ${{ inputs.extra-specs }} + channels: conda-forge channel-priority: 'strict' condarc-file: ci/condarc.yml cache-env: true diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 10bdfa27fac31..8f0765044cd5f 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -1,6 +1,5 @@ name: pandas-dev channels: - - arrow-nightlies - conda-forge dependencies: - python=3.11 @@ -21,9 +20,11 @@ dependencies: - python-dateutil - numpy - pytz - - # optional dependencies - - pyarrow + - pip - pip: - - tzdata>=2022.1 + - "tzdata>=2022.1" + - "--extra-index-url https://pypi.fury.io/arrow-nightlies/" + - "--prefer-binary" + - "--pre" + - "pyarrow" From db122cfb1fa38b404637cdc8b858aab9fe0b052b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Mar 2023 20:26:09 -0700 Subject: [PATCH 05/10] Fix some tests --- pandas/io/parquet.py | 20 ++++++++------------ pandas/tests/util/test_show_versions.py | 2 +- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7791ca53a6447..30dfceb29155a 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -92,22 +92,18 @@ def _get_path_or_handle( if fs is not None: pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore") fsspec = import_optional_dependency("fsspec", errors="ignore") - if pa_fs is None and fsspec is None: - raise ValueError( - f"filesystem must be a pyarrow or fsspec FileSystem, " - f"not a {type(fs).__name__}" - ) - elif (pa_fs is not None and not isinstance(fs, pa_fs.FileSystem)) and ( - fsspec is not None and not isinstance(fs, fsspec.spec.AbstractFileSystem) - ): + if pa_fs is not None and isinstance(fs, pa_fs.FileSystem): + if storage_options: + raise NotImplementedError( + "storage_options not supported with a pyarrow FileSystem." 
+ ) + elif fsspec is not None and isinstance(fs, fsspec.spec.AbstractFileSystem): + pass + else: raise ValueError( f"filesystem must be a pyarrow or fsspec FileSystem, " f"not a {type(fs).__name__}" ) - elif pa_fs is not None and isinstance(fs, pa_fs.FileSystem) and storage_options: - raise NotImplementedError( - "storage_options not supported with a pyarrow FileSystem." - ) if is_fsspec_url(path_or_handle) and fs is None: if storage_options is None: pa = import_optional_dependency("pyarrow") diff --git a/pandas/tests/util/test_show_versions.py b/pandas/tests/util/test_show_versions.py index 8ff78cc073acf..8be5a4665424f 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -65,7 +65,7 @@ def test_show_versions_console(capsys): assert re.search(r"numpy\s*:\s[0-9]+\..*\n", result) # check optional dependency - assert re.search(r"pyarrow\s*:\s([0-9\.]+|None)\n", result) + assert re.search(r"pyarrow\s*:\s([0-9]+|None)\n", result) def test_json_output_match(capsys, tmpdir): From 22f97d26fb72fc636cf16bc321ce643a179c25cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 28 Mar 2023 14:20:51 -0700 Subject: [PATCH 06/10] Address more tests --- pandas/tests/io/test_parquet.py | 3 +++ pandas/tests/util/test_show_versions.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b55e97a4fe0ae..78a9cf706347c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1019,7 +1019,10 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): {"a": [1, 2]}, index=pd.Index([3, 4], name="test"), dtype="int64[pyarrow]" ) expected = df.copy() + import pyarrow + if Version(pyarrow.__version__) < Version("12.0.0"): + expected.index = expected.index.astype("int64[pyarrow]") check_round_trip( df, engine=pa, diff --git a/pandas/tests/util/test_show_versions.py 
b/pandas/tests/util/test_show_versions.py index 8be5a4665424f..714588d179aef 100644 --- a/pandas/tests/util/test_show_versions.py +++ b/pandas/tests/util/test_show_versions.py @@ -65,7 +65,7 @@ def test_show_versions_console(capsys): assert re.search(r"numpy\s*:\s[0-9]+\..*\n", result) # check optional dependency - assert re.search(r"pyarrow\s*:\s([0-9]+|None)\n", result) + assert re.search(r"pyarrow\s*:\s([0-9]+.*|None)\n", result) def test_json_output_match(capsys, tmpdir): From aafc7607f76ac22b2cd352e945f45739d7531752 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 29 Mar 2023 09:28:25 -0700 Subject: [PATCH 07/10] Fix test condition --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 78a9cf706347c..c74548bf63e06 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -1021,7 +1021,7 @@ def test_read_dtype_backend_pyarrow_config_index(self, pa): expected = df.copy() import pyarrow - if Version(pyarrow.__version__) < Version("12.0.0"): + if Version(pyarrow.__version__) > Version("11.0.0"): expected.index = expected.index.astype("int64[pyarrow]") check_round_trip( df, From 509bd071458cb63fc9a788c15b6720d1dcfdb0b8 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 29 Mar 2023 09:36:22 -0700 Subject: [PATCH 08/10] Fix another condition --- pandas/tests/arrays/string_/test_string.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index dd0b43c116266..7e4869589cee6 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -12,6 +12,7 @@ import pandas as pd import pandas._testing as tm from pandas.core.arrays.string_arrow import ArrowStringArray +from 
pandas.util.version import Version @pytest.fixture @@ -406,15 +407,14 @@ def test_fillna_args(dtype, request): arr.fillna(value=1) -@td.skip_if_no("pyarrow") def test_arrow_array(dtype): # protocol added in 0.15.0 - import pyarrow as pa + pa = pytest.importorskip("pyarrow") data = pd.array(["a", "b", "c"], dtype=dtype) arr = pa.array(data) expected = pa.array(list(data), type=pa.string(), from_pandas=True) - if dtype.storage == "pyarrow": + if dtype.storage == "pyarrow" and Version(pa.__version__) <= Version("11.0.0"): expected = pa.chunked_array(expected) assert arr.equals(expected) From 592cfe7a5572b76f82fe60b43de96052acbf93e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 29 Mar 2023 17:21:06 -0700 Subject: [PATCH 09/10] Cleanup name --- .github/workflows/ubuntu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ubuntu.yml b/.github/workflows/ubuntu.yml index cf9c3d252f67d..97ca346142ec1 100644 --- a/.github/workflows/ubuntu.yml +++ b/.github/workflows/ubuntu.yml @@ -79,7 +79,7 @@ jobs: env_file: actions-311-pyarrownightly.yaml pattern: "not slow and not network and not single_cpu" fail-fast: false - name: ${{ matrix.name || format('{0} {1}', matrix.env_file, matrix.pattern) }} + name: ${{ matrix.name || matrix.env_file }} env: ENV_FILE: ci/deps/${{ matrix.env_file }} PATTERN: ${{ matrix.pattern }} From 460da081ee5e79ef2e380c1e2f187698ac6b6e31 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 30 Mar 2023 10:05:46 -0700 Subject: [PATCH 10/10] Remove boto3 --- ci/deps/actions-311-pyarrownightly.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 8f0765044cd5f..77e4fc9d2c2d9 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -14,7 +14,6 @@ dependencies: - pytest-xdist>=2.2.0 - 
hypothesis>=6.34.2 - pytest-asyncio>=0.17.0 - - boto3 # required dependencies - python-dateutil