From 9c9bf7b51bfdf3d17ff04af95c3749c8befc11f1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Sun, 8 Aug 2021 16:09:18 -0700 Subject: [PATCH 1/4] Backport PR #42919: COMPAT: Support fastparquet 0.7.1 --- ci/deps/actions-37-db.yaml | 2 +- ci/deps/azure-windows-38.yaml | 2 +- doc/source/whatsnew/v1.3.2.rst | 2 +- environment.yml | 2 +- pandas/io/parquet.py | 26 ++++++++--- pandas/tests/io/test_parquet.py | 80 ++++++++++++++++++++------------- requirements-dev.txt | 2 +- 7 files changed, 73 insertions(+), 43 deletions(-) diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index cfdcf266236e6..a9e4113bf9d18 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -15,7 +15,7 @@ dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.4.0, < 0.7.0 + - fastparquet>=0.4.0 - fsspec>=0.7.4, <2021.6.0 - gcsfs>=0.6.0 - geopandas diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 902daf102ccda..70aa46e8a5851 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.4.0, <0.7.0 + - fastparquet>=0.4.0 - flask - fsspec>=0.8.0, <2021.6.0 - matplotlib=3.1.3 diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index 669e824fa3989..b0ad5cd506fce 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -47,7 +47,7 @@ Bug fixes Other ~~~~~ -- +- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1. - .. --------------------------------------------------------------------------- diff --git a/environment.yml b/environment.yml index 20b7272e12ebb..e75e56238205b 100644 --- a/environment.yml +++ b/environment.yml @@ -99,7 +99,7 @@ dependencies: - xlwt - odfpy - - fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b7523fada07d0..f0aeeb3e6c893 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,14 +309,21 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + parquet_kwargs = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine" - ) + # Technically works with 0.7.0, but was incorrect + # so lets just require 0.7.1 + if Version(self.api.__version__) >= Version("0.7.1"): + # Need to set even for use_nullable_dtypes = False, + # since our defaults differ + parquet_kwargs["pandas_nulls"] = use_nullable_dtypes + else: + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine for fastparquet versions less than 0.7.1" + ) path = stringify_path(path) - parquet_kwargs = {} handles = None if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") @@ -337,6 +344,7 @@ def read( path, "rb", is_text=False, storage_options=storage_options ) path = handles.handle + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) result = parquet_file.to_pandas(columns=columns, **kwargs) @@ -470,7 +478,7 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + for the resulting DataFrame. As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional @@ -478,6 +486,10 @@ def read_parquet( .. versionadded:: 1.2.0 + .. versionchanged:: 1.3.2 + ``use_nullable_dtypes`` now works with the the ``fastparquet`` engine + if ``fastparquet`` is version 0.7.1 or higher. + **kwargs Any additional kwargs are passed to the engine. diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d100c584b698a..b951e92c0fa9c 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -575,6 +575,46 @@ def test_write_column_index_nonstring(self, pa): msg = r"parquet must have string column names" self.check_error_on_write(df, engine, ValueError, msg) + def test_use_nullable_dtypes(self, engine): + import pyarrow.parquet as pq + + if engine == "fastparquet": + pytest.importorskip( + "fastparquet", + "0.7.1", + reason="fastparquet must be 0.7.1 or higher for nullable dtype support", + ) + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path, engine=engine) + result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + } + ) + if engine == "fastparquet": + # Fastparquet doesn't support string columns yet + # Only int and boolean + result2 = result2.drop("c", axis=1) + expected = expected.drop("c", axis=1) + tm.assert_frame_equal(result2, expected) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): @@ -829,35 +869,6 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow") - def test_use_nullable_dtypes(self, pa): - import pyarrow.parquet as pq - - table = pyarrow.table( - { - "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array([1, 2, 3, None], "uint8"), - "c": pyarrow.array(["a", "b", "c", None]), - "d": pyarrow.array([True, False, True, None]), - } - ) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path) - result2 = read_parquet(path, use_nullable_dtypes=True) - - assert result1["a"].dtype == np.dtype("float64") - expected = pd.DataFrame( - { - "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array([1, 2, 3, None], dtype="UInt8"), - "c": pd.array(["a", "b", "c", None], dtype="string"), - "d": pd.array([True, False, True, None], dtype="boolean"), - } - ) - tm.assert_frame_equal(result2, expected) - def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error @@ -928,7 +939,9 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") - check_round_trip(df, fp, expected=expected) + # Fastparquet bug in 0.7.1 makes it so that this dtype becomes + # float64 + check_round_trip(df, fp, expected=expected, check_dtype=False) def test_unsupported(self, fp): @@ -1049,9 +1062,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected.index.name = "index" check_round_trip(df, fp, expected=expected) - def test_use_nullable_dtypes_not_supported(self, fp): + def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp): df = pd.DataFrame({"a": [1, 2]}) + # This is supported now in fastparquet 0.7.1 and above actually + # Still need to ensure that this raises in all versions below + import fastparquet as fp + + monkeypatch.setattr(fp, "__version__", "0.4") with tm.ensure_clean() as path: df.to_parquet(path) with pytest.raises(ValueError, match="not supported for the fastparquet"): diff --git a/requirements-dev.txt b/requirements-dev.txt index 25ec5e1904d18..3b40c9c300ace 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,7 +64,7 @@ xlrd xlsxwriter xlwt odfpy -fastparquet>=0.3.2, <0.7.0 +fastparquet>=0.3.2 pyarrow>=0.17.0 python-snappy pyqt5>=5.9.2 From 16a2238c3bfd4c652059a6392fc71381251bf498 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Tue, 10 Aug 2021 13:03:56 -0700 Subject: [PATCH 2/4] Revert fastparquet nullable dtype support (#42954) --- doc/source/whatsnew/v1.3.2.rst | 1 - pandas/io/parquet.py | 27 ++++++++++----------------- pandas/tests/io/test_parquet.py | 11 ++++++----- 3 files changed, 16 insertions(+), 23 deletions(-) diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index b0ad5cd506fce..ef8f8245c6640 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -47,7 +47,6 @@ Bug fixes Other ~~~~~ -- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions above 0.7.1. - .. --------------------------------------------------------------------------- diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index f0aeeb3e6c893..49384cfb2e554 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,20 +309,16 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): - parquet_kwargs = {} + parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) - # Technically works with 0.7.0, but was incorrect - # so lets just require 0.7.1 if Version(self.api.__version__) >= Version("0.7.1"): - # Need to set even for use_nullable_dtypes = False, - # since our defaults differ - parquet_kwargs["pandas_nulls"] = use_nullable_dtypes - else: - if use_nullable_dtypes: - raise ValueError( - "The 'use_nullable_dtypes' argument is not supported for the " - "fastparquet engine for fastparquet versions less than 0.7.1" - ) + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs["pandas_nulls"] = False + if use_nullable_dtypes: + raise ValueError( + "The 'use_nullable_dtypes' argument is not supported for the " + "fastparquet engine" + ) path = stringify_path(path) handles = None if is_fsspec_url(path): @@ -478,7 +474,8 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame. + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional @@ -486,10 +483,6 @@ def read_parquet( .. versionadded:: 1.2.0 - .. versionchanged:: 1.3.2 - ``use_nullable_dtypes`` now works with the the ``fastparquet`` engine - if ``fastparquet`` is version 0.7.1 or higher. - **kwargs Any additional kwargs are passed to the engine. diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b951e92c0fa9c..c0e4cde0f01f8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -579,11 +579,9 @@ def test_use_nullable_dtypes(self, engine): import pyarrow.parquet as pq if engine == "fastparquet": - pytest.importorskip( - "fastparquet", - "0.7.1", - reason="fastparquet must be 0.7.1 or higher for nullable dtype support", - ) + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + pytest.skip("Fastparquet nullable dtype support is disabled") table = pyarrow.table( { @@ -591,6 +589,8 @@ def test_use_nullable_dtypes(self, engine): "b": pyarrow.array([1, 2, 3, None], "uint8"), "c": pyarrow.array(["a", "b", "c", None]), "d": pyarrow.array([True, False, True, None]), + # Test that nullable dtypes used even in absence of nulls + "e": pyarrow.array([1, 2, 3, 4], "int64"), } ) with tm.ensure_clean() as path: @@ -606,6 +606,7 @@ def test_use_nullable_dtypes(self, engine): "b": pd.array([1, 2, 3, None], dtype="UInt8"), "c": pd.array(["a", "b", "c", None], dtype="string"), "d": pd.array([True, False, True, None], dtype="boolean"), + "e": pd.array([1, 2, 3, 4], dtype="Int64"), } ) if engine == "fastparquet": From 2dca892a7395d0362f5d1ab92f308c727c0a5d64 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Wed, 11 Aug 2021 11:54:14 -0700 Subject: [PATCH 3/4] Revert doc changes --- doc/source/whatsnew/v1.3.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index ef8f8245c6640..669e824fa3989 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -48,6 +48,7 @@ Bug fixes Other ~~~~~ - +- .. --------------------------------------------------------------------------- From 1080e1187e0ce2e1c1817803a853a8b06eeb12e1 Mon Sep 17 00:00:00 2001 From: Thomas Li <47963215+lithomas1@users.noreply.github.com> Date: Wed, 11 Aug 2021 16:22:54 -0700 Subject: [PATCH 4/4] upgrade numpy? --- ci/deps/actions-37-db.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index a9e4113bf9d18..9d680cb8338fd 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -25,7 +25,7 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.17.* + - numpy=1.18.* - odfpy - openpyxl - pandas-gbq