diff --git a/doc/source/conf.py b/doc/source/conf.py index d24483abd28e1..d2404b757ca11 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -416,6 +416,7 @@ "python": ("https://docs.python.org/3/", None), "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), "statsmodels": ("https://www.statsmodels.org/devel/", None), + "pyarrow": ("https://arrow.apache.org/docs/", None), } # extlinks alias diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index d721e00a0a0b6..f2152c43ceaba 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -4583,17 +4583,15 @@ frames efficient, and to make sharing data across data analysis languages easy. Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas dtypes, including extension dtypes such as categorical and datetime with tz. -Several caveats. +Several caveats: -* This is a newer library, and the format, though stable, is not guaranteed to be backward compatible - to the earlier versions. * The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an error if a non-default one is provided. You can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to ignore it. * Duplicate column names and non-string columns names are not supported -* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message - on an attempt at serialization. +* Actual Python objects in object dtype columns are not supported. These will + raise a helpful error message on an attempt at serialization. See the `Full Documentation <https://github.com/wesm/feather>`__. 
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8a7db87b75d7b..0ba845aa06489 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -88,7 +88,9 @@ Other enhancements - :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`). - :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`) - :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the object inside are unsortable, pass `sort=False` to suppress this warning (:issue:`33015`) -- +- The :meth:`DataFrame.to_feather` method now supports additional keyword + arguments (e.g. to set the compression) that are added in pyarrow 0.17 + (:issue:`33422`). .. --------------------------------------------------------------------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index aedbba755227d..6f0f8f881933b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2058,18 +2058,24 @@ def to_stata( writer.write_file() @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") - def to_feather(self, path) -> None: + def to_feather(self, path, **kwargs) -> None: """ - Write out the binary feather-format for DataFrames. + Write a DataFrame to the binary Feather format. Parameters ---------- path : str String file path. + **kwargs : + Additional keywords passed to :func:`pyarrow.feather.write_feather`. + Starting with pyarrow 0.17, this includes the `compression`, + `compression_level`, `chunksize` and `version` keywords. + + .. 
versionadded:: 1.1.0 """ from pandas.io.feather_format import to_feather - to_feather(self, path) + to_feather(self, path, **kwargs) @Appender( """ diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 5d4925620e75f..cd7045e7f2d2e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,15 +7,18 @@ from pandas.io.common import stringify_path -def to_feather(df: DataFrame, path): +def to_feather(df: DataFrame, path, **kwargs): """ - Write a DataFrame to the feather-format + Write a DataFrame to the binary Feather format. Parameters ---------- df : DataFrame path : string file path, or file-like object + **kwargs : + Additional keywords passed to `pyarrow.feather.write_feather`. + .. versionadded:: 1.1.0 """ import_optional_dependency("pyarrow") from pyarrow import feather @@ -58,7 +61,7 @@ def to_feather(df: DataFrame, path): if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path) + feather.write_feather(df, path, **kwargs) def read_feather(path, columns=None, use_threads: bool = True): diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 0038df78dd866..0755501ee6285 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -4,6 +4,8 @@ import numpy as np import pytest +import pandas.util._test_decorators as td + import pandas as pd import pandas._testing as tm @@ -27,15 +29,15 @@ def check_error_on_write(self, df, exc): with tm.ensure_clean() as path: to_feather(df, path) - def check_round_trip(self, df, expected=None, **kwargs): + def check_round_trip(self, df, expected=None, write_kwargs=None, **read_kwargs): if expected is None: expected = df with tm.ensure_clean() as path: - to_feather(df, path) + to_feather(df, path, **(write_kwargs or {})) - result = read_feather(path, **kwargs) + result = read_feather(path, **read_kwargs) tm.assert_frame_equal(result, expected) def 
test_error(self): @@ -71,6 +73,10 @@ def test_basic(self): "dtns": pd.date_range("20130101", periods=3, freq="ns"), } ) + if pyarrow_version >= LooseVersion("0.16.1.dev"): + df["periods"] = pd.period_range("2013", freq="M", periods=3) + df["timedeltas"] = pd.timedelta_range("1 day", periods=3) + df["intervals"] = pd.interval_range(0, 3, 3) assert df.dttz.dtype.tz.zone == "US/Eastern" self.check_round_trip(df) @@ -102,8 +108,8 @@ def test_read_columns(self): def test_unsupported_other(self): - # period - df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)}) + # mixed python objects + df = pd.DataFrame({"a": ["a", 1, 2.0]}) # Some versions raise ValueError, others raise ArrowInvalid. self.check_error_on_write(df, Exception) @@ -148,3 +154,8 @@ def test_path_localpath(self): df = tm.makeDataFrame().reset_index() result = tm.round_trip_localpath(df.to_feather, pd.read_feather) tm.assert_frame_equal(df, result) + + @td.skip_if_no("pyarrow", min_version="0.16.1.dev") + def test_passthrough_keywords(self): + df = tm.makeDataFrame().reset_index() + self.check_round_trip(df, write_kwargs=dict(version=1))