Skip to content

Commit e042219

Browse files
authored
COMPAT: Support fastparquet 0.7.1 (#42919)
1 parent 08d296f commit e042219

File tree

7 files changed

+73
-43
lines changed

7 files changed

+73
-43
lines changed

ci/deps/actions-38-db.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
- beautifulsoup4
1616
- botocore>=1.11
1717
- dask
18-
- fastparquet>=0.4.0, < 0.7.0
18+
- fastparquet>=0.4.0
1919
- fsspec>=0.7.4, <2021.6.0
2020
- gcsfs>=0.6.0
2121
- geopandas

ci/deps/azure-windows-38.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
# pandas dependencies
1616
- blosc
1717
- bottleneck
18-
- fastparquet>=0.4.0, <0.7.0
18+
- fastparquet>=0.4.0
1919
- flask
2020
- fsspec>=0.8.0, <2021.6.0
2121
- matplotlib=3.3.2

doc/source/whatsnew/v1.3.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ Bug fixes
4444

4545
Other
4646
~~~~~
47-
-
47+
- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` version 0.7.1 and above.
4848
-
4949

5050
.. ---------------------------------------------------------------------------

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ dependencies:
9999
- xlwt
100100
- odfpy
101101

102-
- fastparquet>=0.4.0, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
102+
- fastparquet>=0.4.0 # pandas.read_parquet, DataFrame.to_parquet
103103
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
104104
- python-snappy # required by pyarrow
105105

pandas/io/parquet.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,21 @@ def write(
309309
def read(
310310
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
311311
):
312+
parquet_kwargs = {}
312313
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
313-
if use_nullable_dtypes:
314-
raise ValueError(
315-
"The 'use_nullable_dtypes' argument is not supported for the "
316-
"fastparquet engine"
317-
)
314+
# Technically works with 0.7.0, but was incorrect
315+
# so let's just require 0.7.1
316+
if Version(self.api.__version__) >= Version("0.7.1"):
317+
# Need to set even for use_nullable_dtypes = False,
318+
# since our defaults differ
319+
parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
320+
else:
321+
if use_nullable_dtypes:
322+
raise ValueError(
323+
"The 'use_nullable_dtypes' argument is not supported for the "
324+
"fastparquet engine for fastparquet versions less than 0.7.1"
325+
)
318326
path = stringify_path(path)
319-
parquet_kwargs = {}
320327
handles = None
321328
if is_fsspec_url(path):
322329
fsspec = import_optional_dependency("fsspec")
@@ -337,6 +344,7 @@ def read(
337344
path, "rb", is_text=False, storage_options=storage_options
338345
)
339346
path = handles.handle
347+
340348
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
341349

342350
result = parquet_file.to_pandas(columns=columns, **kwargs)
@@ -470,14 +478,18 @@ def read_parquet(
470478
471479
use_nullable_dtypes : bool, default False
472480
If True, use dtypes that use ``pd.NA`` as missing value indicator
473-
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
481+
for the resulting DataFrame.
474482
As new dtypes are added that support ``pd.NA`` in the future, the
475483
output with this option will change to use those dtypes.
476484
Note: this is an experimental option, and behaviour (e.g. additional
477485
support dtypes) may change without notice.
478486
479487
.. versionadded:: 1.2.0
480488
489+
.. versionchanged:: 1.3.2
490+
``use_nullable_dtypes`` now works with the ``fastparquet`` engine
491+
if ``fastparquet`` is version 0.7.1 or higher.
492+
481493
**kwargs
482494
Any additional kwargs are passed to the engine.
483495

pandas/tests/io/test_parquet.py

+49-31
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,46 @@ def test_write_column_index_nonstring(self, pa):
596596
msg = r"parquet must have string column names"
597597
self.check_error_on_write(df, engine, ValueError, msg)
598598

599+
def test_use_nullable_dtypes(self, engine):
600+
import pyarrow.parquet as pq
601+
602+
if engine == "fastparquet":
603+
pytest.importorskip(
604+
"fastparquet",
605+
"0.7.1",
606+
reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
607+
)
608+
609+
table = pyarrow.table(
610+
{
611+
"a": pyarrow.array([1, 2, 3, None], "int64"),
612+
"b": pyarrow.array([1, 2, 3, None], "uint8"),
613+
"c": pyarrow.array(["a", "b", "c", None]),
614+
"d": pyarrow.array([True, False, True, None]),
615+
}
616+
)
617+
with tm.ensure_clean() as path:
618+
# write manually with pyarrow to write integers
619+
pq.write_table(table, path)
620+
result1 = read_parquet(path, engine=engine)
621+
result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
622+
623+
assert result1["a"].dtype == np.dtype("float64")
624+
expected = pd.DataFrame(
625+
{
626+
"a": pd.array([1, 2, 3, None], dtype="Int64"),
627+
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
628+
"c": pd.array(["a", "b", "c", None], dtype="string"),
629+
"d": pd.array([True, False, True, None], dtype="boolean"),
630+
}
631+
)
632+
if engine == "fastparquet":
633+
# Fastparquet doesn't support string columns yet
634+
# Only int and boolean
635+
result2 = result2.drop("c", axis=1)
636+
expected = expected.drop("c", axis=1)
637+
tm.assert_frame_equal(result2, expected)
638+
599639

600640
@pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
601641
class TestParquetPyArrow(Base):
@@ -842,35 +882,6 @@ def test_additional_extension_types(self, pa):
842882
)
843883
check_round_trip(df, pa)
844884

845-
@td.skip_if_no("pyarrow")
846-
def test_use_nullable_dtypes(self, pa):
847-
import pyarrow.parquet as pq
848-
849-
table = pyarrow.table(
850-
{
851-
"a": pyarrow.array([1, 2, 3, None], "int64"),
852-
"b": pyarrow.array([1, 2, 3, None], "uint8"),
853-
"c": pyarrow.array(["a", "b", "c", None]),
854-
"d": pyarrow.array([True, False, True, None]),
855-
}
856-
)
857-
with tm.ensure_clean() as path:
858-
# write manually with pyarrow to write integers
859-
pq.write_table(table, path)
860-
result1 = read_parquet(path)
861-
result2 = read_parquet(path, use_nullable_dtypes=True)
862-
863-
assert result1["a"].dtype == np.dtype("float64")
864-
expected = pd.DataFrame(
865-
{
866-
"a": pd.array([1, 2, 3, None], dtype="Int64"),
867-
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
868-
"c": pd.array(["a", "b", "c", None], dtype="string"),
869-
"d": pd.array([True, False, True, None], dtype="boolean"),
870-
}
871-
)
872-
tm.assert_frame_equal(result2, expected)
873-
874885
def test_timestamp_nanoseconds(self, pa):
875886
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
876887
# this should work without error
@@ -941,7 +952,9 @@ def test_duplicate_columns(self, fp):
941952
def test_bool_with_none(self, fp):
942953
df = pd.DataFrame({"a": [True, None, False]})
943954
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
944-
check_round_trip(df, fp, expected=expected)
955+
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
956+
# float64
957+
check_round_trip(df, fp, expected=expected, check_dtype=False)
945958

946959
def test_unsupported(self, fp):
947960

@@ -1062,9 +1075,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
10621075
expected.index.name = "index"
10631076
check_round_trip(df, fp, expected=expected)
10641077

1065-
def test_use_nullable_dtypes_not_supported(self, fp):
1078+
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
10661079
df = pd.DataFrame({"a": [1, 2]})
10671080

1081+
# This is actually supported now in fastparquet 0.7.1 and above
1082+
# Still need to ensure that this raises in all versions below
1083+
import fastparquet as fp
1084+
1085+
monkeypatch.setattr(fp, "__version__", "0.4")
10681086
with tm.ensure_clean() as path:
10691087
df.to_parquet(path)
10701088
with pytest.raises(ValueError, match="not supported for the fastparquet"):

requirements-dev.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ xlrd
6464
xlsxwriter
6565
xlwt
6666
odfpy
67-
fastparquet>=0.4.0, <0.7.0
67+
fastparquet>=0.4.0
6868
pyarrow>=0.17.0
6969
python-snappy
7070
tables>=3.6.1

0 commit comments

Comments
 (0)