Skip to content

Commit 255b796

Browse files
authored
Backport fastparquet 0.7 compat (PR pandas-dev#42954 and pandas-dev#42919) (pandas-dev#42987)
1 parent 0130d77 commit 255b796

File tree

6 files changed

+62
-38
lines changed

6 files changed

+62
-38
lines changed

ci/deps/actions-37-db.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
- beautifulsoup4
1616
- botocore>=1.11
1717
- dask
18-
- fastparquet>=0.4.0, < 0.7.0
18+
- fastparquet>=0.4.0
1919
- fsspec>=0.7.4, <2021.6.0
2020
- gcsfs>=0.6.0
2121
- geopandas
@@ -25,7 +25,7 @@ dependencies:
2525
- flask
2626
- nomkl
2727
- numexpr
28-
- numpy=1.17.*
28+
- numpy=1.18.*
2929
- odfpy
3030
- openpyxl
3131
- pandas-gbq

ci/deps/azure-windows-38.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ dependencies:
1515
# pandas dependencies
1616
- blosc
1717
- bottleneck
18-
- fastparquet>=0.4.0, <0.7.0
18+
- fastparquet>=0.4.0
1919
- flask
2020
- fsspec>=0.8.0, <2021.6.0
2121
- matplotlib=3.1.3

environment.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ dependencies:
9999
- xlwt
100100
- odfpy
101101

102-
- fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
102+
- fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
103103
- pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
104104
- python-snappy # required by pyarrow
105105

pandas/io/parquet.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -309,14 +309,17 @@ def write(
309309
def read(
310310
self, path, columns=None, storage_options: StorageOptions = None, **kwargs
311311
):
312+
parquet_kwargs: dict[str, Any] = {}
312313
use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
314+
if Version(self.api.__version__) >= Version("0.7.1"):
315+
# We are disabling nullable dtypes for fastparquet pending discussion
316+
parquet_kwargs["pandas_nulls"] = False
313317
if use_nullable_dtypes:
314318
raise ValueError(
315319
"The 'use_nullable_dtypes' argument is not supported for the "
316320
"fastparquet engine"
317321
)
318322
path = stringify_path(path)
319-
parquet_kwargs = {}
320323
handles = None
321324
if is_fsspec_url(path):
322325
fsspec = import_optional_dependency("fsspec")
@@ -337,6 +340,7 @@ def read(
337340
path, "rb", is_text=False, storage_options=storage_options
338341
)
339342
path = handles.handle
343+
340344
parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
341345

342346
result = parquet_file.to_pandas(columns=columns, **kwargs)
@@ -470,7 +474,8 @@ def read_parquet(
470474
471475
use_nullable_dtypes : bool, default False
472476
If True, use dtypes that use ``pd.NA`` as missing value indicator
473-
for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
477+
for the resulting DataFrame. (only applicable for the ``pyarrow``
478+
engine)
474479
As new dtypes are added that support ``pd.NA`` in the future, the
475480
output with this option will change to use those dtypes.
476481
Note: this is an experimental option, and behaviour (e.g. additional

pandas/tests/io/test_parquet.py

+50-31
Original file line numberDiff line numberDiff line change
@@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa):
575575
msg = r"parquet must have string column names"
576576
self.check_error_on_write(df, engine, ValueError, msg)
577577

578+
def test_use_nullable_dtypes(self, engine):
579+
import pyarrow.parquet as pq
580+
581+
if engine == "fastparquet":
582+
# We are manually disabling fastparquet's
583+
# nullable dtype support pending discussion
584+
pytest.skip("Fastparquet nullable dtype support is disabled")
585+
586+
table = pyarrow.table(
587+
{
588+
"a": pyarrow.array([1, 2, 3, None], "int64"),
589+
"b": pyarrow.array([1, 2, 3, None], "uint8"),
590+
"c": pyarrow.array(["a", "b", "c", None]),
591+
"d": pyarrow.array([True, False, True, None]),
592+
# Test that nullable dtypes used even in absence of nulls
593+
"e": pyarrow.array([1, 2, 3, 4], "int64"),
594+
}
595+
)
596+
with tm.ensure_clean() as path:
597+
# write manually with pyarrow to write integers
598+
pq.write_table(table, path)
599+
result1 = read_parquet(path, engine=engine)
600+
result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
601+
602+
assert result1["a"].dtype == np.dtype("float64")
603+
expected = pd.DataFrame(
604+
{
605+
"a": pd.array([1, 2, 3, None], dtype="Int64"),
606+
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
607+
"c": pd.array(["a", "b", "c", None], dtype="string"),
608+
"d": pd.array([True, False, True, None], dtype="boolean"),
609+
"e": pd.array([1, 2, 3, 4], dtype="Int64"),
610+
}
611+
)
612+
if engine == "fastparquet":
613+
# Fastparquet doesn't support string columns yet
614+
# Only int and boolean
615+
result2 = result2.drop("c", axis=1)
616+
expected = expected.drop("c", axis=1)
617+
tm.assert_frame_equal(result2, expected)
618+
578619

579620
@pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
580621
class TestParquetPyArrow(Base):
@@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa):
829870
)
830871
check_round_trip(df, pa)
831872

832-
@td.skip_if_no("pyarrow")
833-
def test_use_nullable_dtypes(self, pa):
834-
import pyarrow.parquet as pq
835-
836-
table = pyarrow.table(
837-
{
838-
"a": pyarrow.array([1, 2, 3, None], "int64"),
839-
"b": pyarrow.array([1, 2, 3, None], "uint8"),
840-
"c": pyarrow.array(["a", "b", "c", None]),
841-
"d": pyarrow.array([True, False, True, None]),
842-
}
843-
)
844-
with tm.ensure_clean() as path:
845-
# write manually with pyarrow to write integers
846-
pq.write_table(table, path)
847-
result1 = read_parquet(path)
848-
result2 = read_parquet(path, use_nullable_dtypes=True)
849-
850-
assert result1["a"].dtype == np.dtype("float64")
851-
expected = pd.DataFrame(
852-
{
853-
"a": pd.array([1, 2, 3, None], dtype="Int64"),
854-
"b": pd.array([1, 2, 3, None], dtype="UInt8"),
855-
"c": pd.array(["a", "b", "c", None], dtype="string"),
856-
"d": pd.array([True, False, True, None], dtype="boolean"),
857-
}
858-
)
859-
tm.assert_frame_equal(result2, expected)
860-
861873
def test_timestamp_nanoseconds(self, pa):
862874
# with version 2.0, pyarrow defaults to writing the nanoseconds, so
863875
# this should work without error
@@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp):
928940
def test_bool_with_none(self, fp):
929941
df = pd.DataFrame({"a": [True, None, False]})
930942
expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
931-
check_round_trip(df, fp, expected=expected)
943+
# Fastparquet bug in 0.7.1 makes it so that this dtype becomes
944+
# float64
945+
check_round_trip(df, fp, expected=expected, check_dtype=False)
932946

933947
def test_unsupported(self, fp):
934948

@@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
10491063
expected.index.name = "index"
10501064
check_round_trip(df, fp, expected=expected)
10511065

1052-
def test_use_nullable_dtypes_not_supported(self, fp):
1066+
def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
10531067
df = pd.DataFrame({"a": [1, 2]})
10541068

1069+
# This is supported now in fastparquet 0.7.1 and above actually
1070+
# Still need to ensure that this raises in all versions below
1071+
import fastparquet as fp
1072+
1073+
monkeypatch.setattr(fp, "__version__", "0.4")
10551074
with tm.ensure_clean() as path:
10561075
df.to_parquet(path)
10571076
with pytest.raises(ValueError, match="not supported for the fastparquet"):

requirements-dev.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ xlrd
6464
xlsxwriter
6565
xlwt
6666
odfpy
67-
fastparquet>=0.3.2, <0.7.0
67+
fastparquet>=0.3.2
6868
pyarrow>=0.17.0
6969
python-snappy
7070
pyqt5>=5.9.2

0 commit comments

Comments (0)