Backport fastparquet 0.7 compat (PR #42954 and #42919) #42987

Merged · 4 commits · Aug 12, 2021
4 changes: 2 additions & 2 deletions ci/deps/actions-37-db.yaml
@@ -15,7 +15,7 @@ dependencies:
   - beautifulsoup4
   - botocore>=1.11
   - dask
-  - fastparquet>=0.4.0, < 0.7.0
+  - fastparquet>=0.4.0
   - fsspec>=0.7.4, <2021.6.0
   - gcsfs>=0.6.0
   - geopandas
@@ -25,7 +25,7 @@ dependencies:
   - flask
   - nomkl
   - numexpr
-  - numpy=1.17.*
+  - numpy=1.18.*
   - odfpy
   - openpyxl
   - pandas-gbq
2 changes: 1 addition & 1 deletion ci/deps/azure-windows-38.yaml
@@ -15,7 +15,7 @@ dependencies:
   # pandas dependencies
   - blosc
   - bottleneck
-  - fastparquet>=0.4.0, <0.7.0
+  - fastparquet>=0.4.0
   - flask
   - fsspec>=0.8.0, <2021.6.0
   - matplotlib=3.1.3
2 changes: 1 addition & 1 deletion environment.yml
@@ -99,7 +99,7 @@ dependencies:
   - xlwt
   - odfpy
 
-  - fastparquet>=0.3.2, <0.7.0 # pandas.read_parquet, DataFrame.to_parquet
+  - fastparquet>=0.3.2 # pandas.read_parquet, DataFrame.to_parquet
   - pyarrow>=0.17.0 # pandas.read_parquet, DataFrame.to_parquet, pandas.read_feather, DataFrame.to_feather
   - python-snappy # required by pyarrow
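
With the "<0.7.0" cap dropped, a freshly solved environment can pick up fastparquet 0.7.x. A quick way to confirm what actually got installed is pandas' own optional-dependency helper, the same one the parquet code below uses for fsspec (a minimal sketch, not part of the diff):

# Sketch: check which fastparquet an environment resolved to.
# import_optional_dependency raises ImportError if it is absent.
from pandas.compat._optional import import_optional_dependency

fastparquet = import_optional_dependency("fastparquet")
print(fastparquet.__version__)  # may now be >= 0.7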
9 changes: 7 additions & 2 deletions pandas/io/parquet.py
@@ -309,14 +309,17 @@ def write(
     def read(
         self, path, columns=None, storage_options: StorageOptions = None, **kwargs
     ):
+        parquet_kwargs: dict[str, Any] = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
+        if Version(self.api.__version__) >= Version("0.7.1"):
+            # We are disabling nullable dtypes for fastparquet pending discussion
+            parquet_kwargs["pandas_nulls"] = False
         if use_nullable_dtypes:
             raise ValueError(
                 "The 'use_nullable_dtypes' argument is not supported for the "
                 "fastparquet engine"
             )
         path = stringify_path(path)
-        parquet_kwargs = {}
         handles = None
         if is_fsspec_url(path):
             fsspec = import_optional_dependency("fsspec")
@@ -337,6 +340,7 @@
                 path, "rb", is_text=False, storage_options=storage_options
             )
             path = handles.handle
+
         parquet_file = self.api.ParquetFile(path, **parquet_kwargs)
 
         result = parquet_file.to_pandas(columns=columns, **kwargs)
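
The new guard keys off the installed fastparquet version: 0.7.1 added a pandas_nulls option to ParquetFile, and pandas passes False to preserve the pre-0.7.1 non-nullable reading behaviour while nullable-dtype support is still under discussion. A standalone sketch of the same gating pattern, assuming fastparquet is installed (packaging's Version behaves like the Version imported in pandas/io/parquet.py):

# Only pass keyword arguments the installed fastparquet understands.
from packaging.version import Version

import fastparquet

parquet_kwargs = {}
if Version(fastparquet.__version__) >= Version("0.7.1"):
    # 0.7.1+ would otherwise map nulls to pd.NA-backed nullable dtypes
    parquet_kwargs["pandas_nulls"] = False

pf = fastparquet.ParquetFile("data.parquet", **parquet_kwargs)  # illustrative path
df = pf.to_pandas()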
@@ -470,7 +474,8 @@ def read_parquet(

     use_nullable_dtypes : bool, default False
         If True, use dtypes that use ``pd.NA`` as missing value indicator
-        for the resulting DataFrame (only applicable for ``engine="pyarrow"``).
+        for the resulting DataFrame. (only applicable for the ``pyarrow``
+        engine)
         As new dtypes are added that support ``pd.NA`` in the future, the
         output with this option will change to use those dtypes.
         Note: this is an experimental option, and behaviour (e.g. additional
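
In practice the documented behaviour looks like this (a hedged example; the file name is illustrative, and the Int64 result follows the nullable-dtype mapping exercised by the test below):

# use_nullable_dtypes only takes effect with the pyarrow engine;
# the fastparquet engine raises ValueError instead (see read() above).
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"a": pa.array([1, 2, None], type="int64")})
pq.write_table(table, "example.parquet")

pd.read_parquet("example.parquet")["a"].dtype
# -> float64, with NaN for the missing value
pd.read_parquet("example.parquet", use_nullable_dtypes=True)["a"].dtype
# -> Int64, with pd.NA for the missing value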
81 changes: 50 additions & 31 deletions pandas/tests/io/test_parquet.py
@@ -575,6 +575,47 @@ def test_write_column_index_nonstring(self, pa):
         msg = r"parquet must have string column names"
         self.check_error_on_write(df, engine, ValueError, msg)
 
+    def test_use_nullable_dtypes(self, engine):
+        import pyarrow.parquet as pq
+
+        if engine == "fastparquet":
+            # We are manually disabling fastparquet's
+            # nullable dtype support pending discussion
+            pytest.skip("Fastparquet nullable dtype support is disabled")
+
+        table = pyarrow.table(
+            {
+                "a": pyarrow.array([1, 2, 3, None], "int64"),
+                "b": pyarrow.array([1, 2, 3, None], "uint8"),
+                "c": pyarrow.array(["a", "b", "c", None]),
+                "d": pyarrow.array([True, False, True, None]),
+                # Test that nullable dtypes used even in absence of nulls
+                "e": pyarrow.array([1, 2, 3, 4], "int64"),
+            }
+        )
+        with tm.ensure_clean() as path:
+            # write manually with pyarrow to write integers
+            pq.write_table(table, path)
+            result1 = read_parquet(path, engine=engine)
+            result2 = read_parquet(path, engine=engine, use_nullable_dtypes=True)
+
+        assert result1["a"].dtype == np.dtype("float64")
+        expected = pd.DataFrame(
+            {
+                "a": pd.array([1, 2, 3, None], dtype="Int64"),
+                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
+                "c": pd.array(["a", "b", "c", None], dtype="string"),
+                "d": pd.array([True, False, True, None], dtype="boolean"),
+                "e": pd.array([1, 2, 3, 4], dtype="Int64"),
+            }
+        )
+        if engine == "fastparquet":
+            # Fastparquet doesn't support string columns yet
+            # Only int and boolean
+            result2 = result2.drop("c", axis=1)
+            expected = expected.drop("c", axis=1)
+        tm.assert_frame_equal(result2, expected)
+
 
 @pytest.mark.filterwarnings("ignore:CategoricalBlock is deprecated:DeprecationWarning")
 class TestParquetPyArrow(Base):
@@ -829,35 +870,6 @@ def test_additional_extension_types(self, pa):
         )
         check_round_trip(df, pa)
 
-    @td.skip_if_no("pyarrow")
-    def test_use_nullable_dtypes(self, pa):
-        import pyarrow.parquet as pq
-
-        table = pyarrow.table(
-            {
-                "a": pyarrow.array([1, 2, 3, None], "int64"),
-                "b": pyarrow.array([1, 2, 3, None], "uint8"),
-                "c": pyarrow.array(["a", "b", "c", None]),
-                "d": pyarrow.array([True, False, True, None]),
-            }
-        )
-        with tm.ensure_clean() as path:
-            # write manually with pyarrow to write integers
-            pq.write_table(table, path)
-            result1 = read_parquet(path)
-            result2 = read_parquet(path, use_nullable_dtypes=True)
-
-        assert result1["a"].dtype == np.dtype("float64")
-        expected = pd.DataFrame(
-            {
-                "a": pd.array([1, 2, 3, None], dtype="Int64"),
-                "b": pd.array([1, 2, 3, None], dtype="UInt8"),
-                "c": pd.array(["a", "b", "c", None], dtype="string"),
-                "d": pd.array([True, False, True, None], dtype="boolean"),
-            }
-        )
-        tm.assert_frame_equal(result2, expected)
-
     def test_timestamp_nanoseconds(self, pa):
         # with version 2.0, pyarrow defaults to writing the nanoseconds, so
         # this should work without error
@@ -928,7 +940,9 @@ def test_duplicate_columns(self, fp):
     def test_bool_with_none(self, fp):
         df = pd.DataFrame({"a": [True, None, False]})
         expected = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
-        check_round_trip(df, fp, expected=expected)
+        # Fastparquet bug in 0.7.1 makes it so that this dtype becomes
+        # float64
+        check_round_trip(df, fp, expected=expected, check_dtype=False)
 
     def test_unsupported(self, fp):
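
The check_dtype=False added above relaxes only the dtype comparison in the round-trip assertion, not the values. Sketched directly against the testing helper (assuming check_round_trip forwards the flag to tm.assert_frame_equal):

# The float16-vs-float64 mismatch from fastparquet 0.7.1 is tolerated,
# but the values themselves still have to match.
import numpy as np
import pandas as pd
import pandas._testing as tm

left = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float16")
right = pd.DataFrame({"a": [1.0, np.nan, 0.0]}, dtype="float64")
tm.assert_frame_equal(left, right, check_dtype=False)  # passes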
@@ -1049,9 +1063,14 @@ def test_timezone_aware_index(self, fp, timezone_aware_date_list):
         expected.index.name = "index"
         check_round_trip(df, fp, expected=expected)
 
-    def test_use_nullable_dtypes_not_supported(self, fp):
+    def test_use_nullable_dtypes_not_supported(self, monkeypatch, fp):
         df = pd.DataFrame({"a": [1, 2]})
 
+        # This is supported now in fastparquet 0.7.1 and above actually
+        # Still need to ensure that this raises in all versions below
+        import fastparquet as fp
+
+        monkeypatch.setattr(fp, "__version__", "0.4")

[inline review comment on the monkeypatch line]

Member: This monkeypatch actually shouldn't be needed?

Member (Author): Yep, forgot to revert this part. It's actually present in master too. Will do after this, so the bot can auto backport.

         with tm.ensure_clean() as path:
             df.to_parquet(path)
             with pytest.raises(ValueError, match="not supported for the fastparquet"):
2 changes: 1 addition & 1 deletion requirements-dev.txt
@@ -64,7 +64,7 @@ xlrd
 xlsxwriter
 xlwt
 odfpy
-fastparquet>=0.3.2, <0.7.0
+fastparquet>=0.3.2
 pyarrow>=0.17.0
 python-snappy
 pyqt5>=5.9.2