diff --git a/ci/deps/actions-37-db.yaml b/ci/deps/actions-37-db.yaml index cfdcf266236e6..9d680cb8338fd 100644 --- a/ci/deps/actions-37-db.yaml +++ b/ci/deps/actions-37-db.yaml @@ -15,7 +15,7 @@ dependencies: - beautifulsoup4 - botocore>=1.11 - dask - - fastparquet>=0.4.0, < 0.7.0 + - fastparquet>=0.4.0 - fsspec>=0.7.4, <2021.6.0 - gcsfs>=0.6.0 - geopandas @@ -25,7 +25,7 @@ dependencies: - flask - nomkl - numexpr - - numpy=1.17.* + - numpy=1.18.* - odfpy - openpyxl - pandas-gbq diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index 902daf102ccda..70aa46e8a5851 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - blosc - bottleneck - - fastparquet>=0.4.0, <0.7.0 + - fastparquet>=0.4.0 - flask - fsspec>=0.8.0, <2021.6.0 - matplotlib=3.1.3 diff --git a/environment.yml b/environment.yml index 20b7272e12ebb..e75e56238205b 100644 --- a/environment.yml +++ b/environment.yml @@ -99,7 +99,7 @@ dependencies: - xlwt - odfpy - - fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet + - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather - python-snappy # required by pyarrow diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index b7523fada07d0..49384cfb2e554 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -309,14 +309,17 @@ def write( def read( self, path, columns=None, storage_options: StorageOptions = None, **kwargs ): + parquet_kwargs: dict[str, Any] = {} use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False) + if Version(self.api.__version__) >= Version("0.7.1"): + # We are disabling nullable dtypes for fastparquet pending discussion + parquet_kwargs["pandas_nulls"] = False if use_nullable_dtypes: raise ValueError( "The 'use_nullable_dtypes' argument is not supported for the " "fastparquet engine" ) path = stringify_path(path) - parquet_kwargs = {} handles = None if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") @@ -337,6 +340,7 @@ def read( path, "rb", is_text=False, storage_options=storage_options ) path = handles.handle + parquet_file = self.api.ParquetFile(path, **parquet_kwargs) result = parquet_file.to_pandas(columns=columns, **kwargs) @@ -470,7 +474,8 @@ def read_parquet( use_nullable_dtypes : bool, default False If True, use dtypes that use ``pd.NA`` as missing value indicator - for the resulting DataFrame (only applicable for ``engine="pyarrow"``). + for the resulting DataFrame. (only applicable for the ``pyarrow`` + engine) As new dtypes are added that support ``pd.NA`` in the future, the output with this option will change to use those dtypes. Note: this is an experimental option, and behaviour (e.g. additional diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index d100c584b698a..c0e4cde0f01f8 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa): msg = r"parquet must have string column names" self.check_error_on_write(df, engine, ValueError, msg) + def test_use_nullable_dtypes(self, engine): + import pyarrow.parquet as pq + + if engine == "fastparquet": + # We are manually disabling fastparquet's + # nullable dtype support pending discussion + pytest.skip("Fastparquet nullable dtype support is disabled") + + table = pyarrow.table( + { + "a": pyarrow.array([1, 2, 3, None], "int64"), + "b": pyarrow.array([1, 2, 3, None], "uint8"), + "c": pyarrow.array(["a", "b", "c", None]), + "d": pyarrow.array([True, False, True, None]), + # Test that nullable dtypes used even in absence of nulls + "e": pyarrow.array([1, 2, 3, 4], "int64"), + } + ) + with tm.ensure_clean() as path: + # write manually with pyarrow to write integers + pq.write_table(table, path) + result1 = read_parquet(path, engine=engine) + result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True) + + assert result1["a"].dtype == np.dtype("float64") + expected = pd.DataFrame( + { + "a": pd.array([1, 2, 3, None], dtype="Int64"), + "b": pd.array([1, 2, 3, None], dtype="UInt8"), + "c": pd.array(["a", "b", "c", None], dtype="string"), + "d": pd.array([True, False, True, None], dtype="boolean"), + "e": pd.array([1, 2, 3, 4], dtype="Int64"), + } + ) + if engine == "fastparquet": + # Fastparquet doesn't support string columns yet + # Only int and boolean + result2 = result2.drop("c", axis=1) + expected = expected.drop("c", axis=1) + tm.assert_frame_equal(result2, expected) + @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning") class TestParquetPyArrow(Base): @@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa): ) check_round_trip(df, pa) - @td.skip_if_no("pyarrow") - def test_use_nullable_dtypes(self, pa): - import pyarrow.parquet as pq - - table = pyarrow.table( - { - "a": pyarrow.array([1, 2, 3, None], "int64"), - "b": pyarrow.array([1, 2, 3, None], "uint8"), - "c": pyarrow.array(["a", "b", "c", None]), - "d": pyarrow.array([True, False, True, None]), - } - ) - with tm.ensure_clean() as path: - # write manually with pyarrow to write integers - pq.write_table(table, path) - result1 = read_parquet(path) - result2 = read_parquet(path, use_nullable_dtypes=True) - - assert result1["a"].dtype == np.dtype("float64") - expected = pd.DataFrame( - { - "a": pd.array([1, 2, 3, None], dtype="Int64"), - "b": pd.array([1, 2, 3, None], dtype="UInt8"), - "c": pd.array(["a", "b", "c", None], dtype="string"), - "d": pd.array([True, False, True, None], dtype="boolean"), - } - ) - tm.assert_frame_equal(result2, expected) - def test_timestamp_nanoseconds(self, pa): # with version 2.0, pyarrow defaults to writing the nanoseconds, so # this should work without error @@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp): def test_bool_with_none(self, fp): df = pd.DataFrame({"a": [True, None, False]}) expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16") - check_round_trip(df, fp, expected=expected) + # Fastparquet bug in 0.7.1 makes it so that this dtype becomes + # float64 + check_round_trip(df, fp, expected=expected, check_dtype=False) def test_unsupported(self, fp): @@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list): expected.index.name = "index" check_round_trip(df, fp, expected=expected) - def test_use_nullable_dtypes_not_supported(self, fp): + def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp): df = pd.DataFrame({"a": [1, 2]}) + # This is supported now in fastparquet 0.7.1 and above actually + # Still need to ensure that this raises in all versions below + import fastparquet as fp + + monkeypatch.setattr(fp, "__version__", "0.4") with tm.ensure_clean() as path: df.to_parquet(path) with pytest.raises(ValueError, match="not supported for the fastparquet"): diff --git a/requirements-dev.txt b/requirements-dev.txt index 25ec5e1904d18..3b40c9c300ace 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -64,7 +64,7 @@ xlrd xlsxwriter xlwt odfpy -fastparquet>=0.3.2, <0.7.0 +fastparquet>=0.3.2 pyarrow>=0.17.0 python-snappy pyqt5>=5.9.2