
COMPAT: Support fastparquet 0.7.1 #42919


Merged (4 commits) on Aug 8, 2021
2 changes: 1 addition & 1 deletion ci/deps/actions-38-db.yaml
@@ -15,7 +15,7 @@ dependencies:
   - beautifulsoup4
   - botocore>=1.11
   - dask
-  - fastparquet>=0.4.0, < 0.7.0
+  - fastparquet>=0.4.0
   - fsspec>=0.7.4, <2021.6.0
   - gcsfs>=0.6.0
   - geopandas
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
@@ -15,7 +15,7 @@ dependencies:
   # pandas dependencies
   - blosc
   - bottleneck
-  - fastparquet>=0.4.0, <0.7.0
+  - fastparquet>=0.4.0
   - flask
   - fsspec>=0.8.0, <2021.6.0
   - matplotlib=3.3.2
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.2.rst
@@ -39,7 +39,7 @@ Bug fixes
 
 Other
 ~~~~~
--
+- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` versions 0.7.1 and above.
 -
 
 .. ---------------------------------------------------------------------------
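A usage sketch of the new behavior (hypothetical file name; mirrors the test added in this PR, which writes the file with pyarrow so the parquet column is a genuine nullable int64):

    import pandas as pd
    import pyarrow as pa
    import pyarrow.parquet as pq

    # An int64 column containing a null: a plain read promotes it to float64.
    table = pa.table({"a": pa.array([1, 2, None], "int64")})
    pq.write_table(table, "example.parquet")

    # With fastparquet >= 0.7.1 installed, use_nullable_dtypes=True yields the
    # nullable Int64 dtype instead.
    result = pd.read_parquet(
        "example.parquet", engine="fastparquet", use_nullable_dtypes=True
    )
    print(result["a"].dtype)  # Int64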
2 changes: 1 addition & 1 deletion environment.yml
@@ -99,7 +99,7 @@ dependencies:
   - xlwt
   - odfpy
 
-  - fastparquet>=0.4.0, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
+  - fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet
   - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
   - python-snappy # required by pyarrow
 
26 changes: 19 additions & 7 deletions pandas/io/parquet.py
@@ -309,14 +309,21 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
+        parquet_kwargs = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
-        if use_nullable_dtypes:
-            raise ValueError(
-                "The 'use_nullable_dtypes' argument is not supported for the "
-                "fastparquet engine"
-            )
+        # Technically works with 0.7.0, but was incorrect
+        # so lets just require 0.7.1
+        if Version(self.api.__version__) >= Version("0.7.1"):
+            # Need to set even for use_nullable_dtypes = False,
+            # since our defaults differ
+            parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
+        else:
+            if use_nullable_dtypes:
+                raise ValueError(
+                    "The 'use_nullable_dtypes' argument is not supported for the "
+                    "fastparquet engine for fastparquet versions less than 0.7.1"
+                )
         path = stringify_path(path)
-        parquet_kwargs = {}
         handles = None
         if is_fsspec_url(path):
             fsspec = import_optional_dependency("fsspec")

Inline review comment (Contributor), on the pre-0.7.1 raise: is this path tested?

@@ -337,6 +344,7 @@ def read(
                 path, "rb", is_text=False, storage_options=storage_options
             )
             path = handles.handle
+
         parquet_file = self.api.ParquetFile(path, **parquet_kwargs)

         result = parquet_file.to_pandas(columns=columns, **kwargs)
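For reference, a minimal sketch of what the engine now does under the hood when the flag is set: per the diff above, ``pandas_nulls`` is forwarded to fastparquet's ``ParquetFile`` (file name hypothetical; assumes fastparquet >= 0.7.1, where the keyword exists):

    import fastparquet

    # pandas_nulls=True asks fastparquet (>= 0.7.1) to map parquet nulls to
    # pd.NA-backed nullable dtypes when converting to pandas.
    pf = fastparquet.ParquetFile("example.parquet", pandas_nulls=True)
    df = pf.to_pandas()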
@@ -470,14 +478,18 @@ def read_parquet(
 
     use_nullable_dtypes : bool, default False
         If True, use dtypes that use ``pd.NA`` as missing value indicator
-        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        for the resulting DataFrame.
         As new dtypes are added that support ``pd.NA`` in the future, the
         output with this option will change to use those dtypes.
         Note: this is an experimental option, and behaviour (e.g. additional
         supported dtypes) may change without notice.
 
         .. versionadded:: 1.2.0
 
+        .. versionchanged:: 1.3.2
+            ``use_nullable_dtypes`` now works with the ``fastparquet`` engine
+            if ``fastparquet`` is version 0.7.1 or higher.
+
     **kwargs
         Any additional kwargs are passed to the engine.
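And the guarded failure mode on older installations, as documented above (a sketch; the message text comes from the diff, and the error is raised before the file is opened):

    import pandas as pd

    # With fastparquet < 0.7.1 installed, the flag still raises, but the
    # message now states the minimum supported version:
    try:
        pd.read_parquet(
            "example.parquet", engine="fastparquet", use_nullable_dtypes=True
        )
    except ValueError as err:
        print(err)
        # The 'use_nullable_dtypes' argument is not supported for the
        # fastparquet engine for fastparquet versions less than 0.7.1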
80 changes: 49 additions & 31 deletions pandas/tests/io/test_parquet.py
@@ -596,6 +596,46 @@ def test_write_column_index_nonstring(self, pa):
         msg = r"parquet must have string column names"
         self.check_error_on_write(df, engine, ValueError, msg)
 
+    def test_use_nullable_dtypes(self, engine):
+        import pyarrow.parquet as pq
+
+        if engine == "fastparquet":
+            pytest.importorskip(
+                "fastparquet",
+                "0.7.1",
+                reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
+            )
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path, engine=engine)
+            result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+            }
+        )
+        if engine == "fastparquet":
+            # Fastparquet doesn't support string columns yet
+            # Only int and boolean
+            result2 = result2.drop("c", axis=1)
+            expected = expected.drop("c", axis=1)
+        tm.assert_frame_equal(result2, expected)
+
 
 @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
 class TestParquetPyArrow(Base):
@@ -842,35 +882,6 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
-    @td.skip_if_no("pyarrow")
-    def test_use_nullable_dtypes(self, pa):
-        import pyarrow.parquet as pq
-
-        table = pyarrow.table(
-            {
-                "a": pyarrow.array([1, 2, 3, None], "int64"),
-                "b": pyarrow.array([1, 2, 3, None], "uint8"),
-                "c": pyarrow.array(["a", "b", "c", None]),
-                "d": pyarrow.array([True, False, True, None]),
-            }
-        )
-        with tm.ensure_clean() as path:
-            # write manually with pyarrow to write integers
-            pq.write_table(table, path)
-            result1 = read_parquet(path)
-            result2 = read_parquet(path, use_nullable_dtypes=True)
-
-        assert result1["a"].dtype == np.dtype("float64")
-        expected = pd.DataFrame(
-            {
-                "a": pd.array([1, 2, 3, None], dtype="Int64"),
-                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
-                "c": pd.array(["a", "b", "c", None], dtype="string"),
-                "d": pd.array([True, False, True, None], dtype="boolean"),
-            }
-        )
-        tm.assert_frame_equal(result2, expected)
-
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
         # this should work without error
@@ -941,7 +952,9 @@ def test_duplicate_columns(self, fp):
     def test_bool_with_none(self, fp):
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
-        check_round_trip(df, fp, expected=expected)
+        # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
+        # float64
+        check_round_trip(df, fp, expected=expected, check_dtype=False)
 
     def test_unsupported(self, fp):
 
@@ -1062,9 +1075,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
 
-    def test_use_nullable_dtypes_not_supported(self, fp):
+    def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
         df = pd.DataFrame({"a": [1, 2]})
 
+        # This is supported now in fastparquet 0.7.1 and above actually
+        # Still need to ensure that this raises in all versions below
+        import fastparquet as fp
+
+        monkeypatch.setattr(fp, "__version__", "0.4")
         with tm.ensure_clean() as path:
             df.to_parquet(path)
             with pytest.raises(ValueError, match="not supported for the fastparquet"):
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -64,7 +64,7 @@ xlrd
 xlsxwriter
 xlwt
 odfpy
-fastparquet>=0.4.0, <0.7.0
+fastparquet>=0.4.0
 pyarrow>=0.17.0
 python-snappy
 tables>=3.6.1