Skip to content

Commit 9c9bf7b

Browse files
committed
Backport PR pandas-dev#42919: COMPAT: Support fastparquet 0.7.1
1 parent 9549595 commit 9c9bf7b

File tree

7 files changed

+73
-43
lines changed

7 files changed

+73
-43
lines changed

ci/deps/actions-37-db.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
- beautifulsoup4
1616
- botocore>=1.11
1717
- dask
18-
- fastparquet>=0.4.0, < 0.7.0
18+
- fastparquet>=0.4.0
1919
- fsspec>=0.7.4, <2021.6.0
2020
- gcsfs>=0.6.0
2121
- geopandas

ci/deps/azure-windows-38.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
# pandas dependencies
1616
- blosc
1717
- bottleneck
18-
- fastparquet>=0.4.0, <0.7.0
18+
- fastparquet>=0.4.0
1919
- flask
2020
- fsspec>=0.8.0, <2021.6.0
2121
- matplotlib=3.1.3

doc/source/whatsnew/v1.3.2.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ Bug fixes
4747

4848
Other
4949
~~~~~
50-
-
50+
- :meth:`pandas.read_parquet` now supports reading nullable dtypes with ``fastparquet`` version 0.7.1 and above.
5151
-
5252

5353
.. ---------------------------------------------------------------------------

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ dependencies:
9999
- xlwt
100100
- odfpy
101101

102-
- fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
102+
- fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
103103
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
104104
- python-snappy # required by pyarrow
105105

pandas/io/parquet.py

+19-7
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,21 @@ def write(
309309
def read(
310310
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
311311
):
312+
parquet_kwargs = {}
312313
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
313-
if use_nullable_dtypes:
314-
raise ValueError(
315-
"The 'use_nullable_dtypes' argument is not supported for the "
316-
"fastparquet engine"
317-
)
314+
# Technically works with 0.7.0, but was incorrect
315+
# so let's just require 0.7.1
316+
if Version(self.api.__version__) >= Version("0.7.1"):
317+
# Need to set even for use_nullable_dtypes = False,
318+
# since our defaults differ
319+
parquet_kwargs["pandas_nulls"] = use_nullable_dtypes
320+
else:
321+
if use_nullable_dtypes:
322+
raise ValueError(
323+
"The 'use_nullable_dtypes' argument is not supported for the "
324+
"fastparquet engine for fastparquet versions less than 0.7.1"
325+
)
318326
path = stringify_path(path)
319-
parquet_kwargs = {}
320327
handles = None
321328
if is_fsspec_url(path):
322329
fsspec = import_optional_dependency("fsspec")
@@ -337,6 +344,7 @@ def read(
337344
path, "rb", is_text=False, storage_options=storage_options
338345
)
339346
path = handles.handle
347+
340348
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
341349

342350
result = parquet_file.to_pandas(columns=columns, **kwargs)
@@ -470,14 +478,18 @@ def read_parquet(
470478
471479
use_nullable_dtypes : bool, default False
472480
If True, use dtypes that use ``pd.NA`` as missing value indicator
473-
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
481+
for the resulting DataFrame.
474482
As new dtypes are added that support ``pd.NA`` in the future, the
475483
output with this option will change to use those dtypes.
476484
Note: this is an experimental option, and behaviour (e.g. additional
477485
support dtypes) may change without notice.
478486
479487
.. versionadded:: 1.2.0
480488
489+
.. versionchanged:: 1.3.2
490+
``use_nullable_dtypes`` now works with the ``fastparquet`` engine
491+
if ``fastparquet`` is version 0.7.1 or higher.
492+
481493
**kwargs
482494
Any additional kwargs are passed to the engine.
483495

pandas/tests/io/test_parquet.py

+49-31
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,46 @@ def test_write_column_index_nonstring(self, pa):
575575
msg = r"parquet must have string column names"
576576
self.check_error_on_write(df, engine, ValueError, msg)
577577

578+
def test_use_nullable_dtypes(self, engine):
579+
import pyarrow.parquet as pq
580+
581+
if engine == "fastparquet":
582+
pytest.importorskip(
583+
"fastparquet",
584+
"0.7.1",
585+
reason="fastparquet must be 0.7.1 or higher for nullable dtype support",
586+
)
587+
588+
table = pyarrow.table(
589+
{
590+
"a": pyarrow.array([1, 2, 3, None], "int64"),
591+
"b": pyarrow.array([1, 2, 3, None], "uint8"),
592+
"c": pyarrow.array(["a", "b", "c", None]),
593+
"d": pyarrow.array([True, False, True, None]),
594+
}
595+
)
596+
with tm.ensure_clean() as path:
597+
# write manually with pyarrow to write integers
598+
pq.write_table(table, path)
599+
result1 = read_parquet(path, engine=engine)
600+
result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
601+
602+
assert result1["a"].dtype == np.dtype("float64")
603+
expected = pd.DataFrame(
604+
{
605+
"a": pd.array([1, 2, 3, None], dtype="Int64"),
606+
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
607+
"c": pd.array(["a", "b", "c", None], dtype="string"),
608+
"d": pd.array([True, False, True, None], dtype="boolean"),
609+
}
610+
)
611+
if engine == "fastparquet":
612+
# Fastparquet doesn't support string columns yet
613+
# Only int and boolean
614+
result2 = result2.drop("c", axis=1)
615+
expected = expected.drop("c", axis=1)
616+
tm.assert_frame_equal(result2, expected)
617+
578618

579619
@pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
580620
class TestParquetPyArrow(Base):
@@ -829,35 +869,6 @@ def test_additional_extension_types(self, pa):
829869
)
830870
check_round_trip(df, pa)
831871

832-
@td.skip_if_no("pyarrow")
833-
def test_use_nullable_dtypes(self, pa):
834-
import pyarrow.parquet as pq
835-
836-
table = pyarrow.table(
837-
{
838-
"a": pyarrow.array([1, 2, 3, None], "int64"),
839-
"b": pyarrow.array([1, 2, 3, None], "uint8"),
840-
"c": pyarrow.array(["a", "b", "c", None]),
841-
"d": pyarrow.array([True, False, True, None]),
842-
}
843-
)
844-
with tm.ensure_clean() as path:
845-
# write manually with pyarrow to write integers
846-
pq.write_table(table, path)
847-
result1 = read_parquet(path)
848-
result2 = read_parquet(path, use_nullable_dtypes=True)
849-
850-
assert result1["a"].dtype == np.dtype("float64")
851-
expected = pd.DataFrame(
852-
{
853-
"a": pd.array([1, 2, 3, None], dtype="Int64"),
854-
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
855-
"c": pd.array(["a", "b", "c", None], dtype="string"),
856-
"d": pd.array([True, False, True, None], dtype="boolean"),
857-
}
858-
)
859-
tm.assert_frame_equal(result2, expected)
860-
861872
def test_timestamp_nanoseconds(self, pa):
862873
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
863874
# this should work without error
@@ -928,7 +939,9 @@ def test_duplicate_columns(self, fp):
928939
def test_bool_with_none(self, fp):
929940
df = pd.DataFrame({"a": [True, None, False]})
930941
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
931-
check_round_trip(df, fp, expected=expected)
942+
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
943+
# float64
944+
check_round_trip(df, fp, expected=expected, check_dtype=False)
932945

933946
def test_unsupported(self, fp):
934947

@@ -1049,9 +1062,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
10491062
expected.index.name = "index"
10501063
check_round_trip(df, fp, expected=expected)
10511064

1052-
def test_use_nullable_dtypes_not_supported(self, fp):
1065+
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
10531066
df = pd.DataFrame({"a": [1, 2]})
10541067

1068+
# This is supported now in fastparquet 0.7.1 and above actually
1069+
# Still need to ensure that this raises in all versions below
1070+
import fastparquet as fp
1071+
1072+
monkeypatch.setattr(fp, "__version__", "0.4")
10551073
with tm.ensure_clean() as path:
10561074
df.to_parquet(path)
10571075
with pytest.raises(ValueError, match="not supported for the fastparquet"):

requirements-dev.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ xlrd
6464
xlsxwriter
6565
xlwt
6666
odfpy
67-
fastparquet>=0.3.2, <0.7.0
67+
fastparquet>=0.3.2
6868
pyarrow>=0.17.0
6969
python-snappy
7070
pyqt5>=5.9.2

0 commit comments

Comments
 (0)