Skip to content

Commit a00202d

Browse files
ENH: update feather IO for pyarrow 0.17 / Feather V2 (#33422)
1 parent 20474d5 commit a00202d

File tree

6 files changed

+38
-17
lines changed

6 files changed

+38
-17
lines changed

doc/source/conf.py

+1
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,7 @@
416416
"python": ("https://docs.python.org/3/", None),
417417
"scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
418418
"statsmodels": ("https://www.statsmodels.org/devel/", None),
419+
"pyarrow": ("https://arrow.apache.org/docs/", None),
419420
}
420421

421422
# extlinks alias

doc/source/user_guide/io.rst

+3-5
Original file line numberDiff line numberDiff line change
@@ -4583,17 +4583,15 @@ frames efficient, and to make sharing data across data analysis languages easy.
45834583
Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas
45844584
dtypes, including extension dtypes such as categorical and datetime with tz.
45854585

4586-
Several caveats.
4586+
Several caveats:
45874587

4588-
* This is a newer library, and the format, though stable, is not guaranteed to be backward compatible
4589-
to the earlier versions.
45904588
* The format will NOT write an ``Index``, or ``MultiIndex`` for the
45914589
``DataFrame`` and will raise an error if a non-default one is provided. You
45924590
can ``.reset_index()`` to store the index or ``.reset_index(drop=True)`` to
45934591
ignore it.
45944592
* Duplicate column names and non-string column names are not supported
4595-
* Non supported types include ``Period`` and actual Python object types. These will raise a helpful error message
4596-
on an attempt at serialization.
4593+
* Actual Python objects in object dtype columns are not supported. These will
4594+
raise a helpful error message on an attempt at serialization.
45974595

45984596
See the `Full Documentation <https://github.com/wesm/feather>`__.
45994597

doc/source/whatsnew/v1.1.0.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,9 @@ Other enhancements
8888
- :class:`Series.str` now has a `fullmatch` method that matches a regular expression against the entire string in each row of the series, similar to `re.fullmatch` (:issue:`32806`).
8989
- :meth:`DataFrame.sample` will now also allow array-like and BitGenerator objects to be passed to ``random_state`` as seeds (:issue:`32503`)
9090
- :meth:`MultiIndex.union` will now raise `RuntimeWarning` if the objects inside are unsortable. Pass `sort=False` to suppress this warning (:issue:`33015`)
91-
-
91+
- The :meth:`DataFrame.to_feather` method now supports additional keyword
92+
arguments (e.g. to set the compression) that were added in pyarrow 0.17
93+
(:issue:`33422`).
9294

9395
.. ---------------------------------------------------------------------------
9496

pandas/core/frame.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -2063,18 +2063,24 @@ def to_stata(
20632063
writer.write_file()
20642064

20652065
@deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
2066-
def to_feather(self, path) -> None:
2066+
def to_feather(self, path, **kwargs) -> None:
20672067
"""
2068-
Write out the binary feather-format for DataFrames.
2068+
Write a DataFrame to the binary Feather format.
20692069
20702070
Parameters
20712071
----------
20722072
path : str
20732073
String file path.
2074+
**kwargs :
2075+
Additional keywords passed to :func:`pyarrow.feather.write_feather`.
2076+
Starting with pyarrow 0.17, this includes the `compression`,
2077+
`compression_level`, `chunksize` and `version` keywords.
2078+
2079+
.. versionadded:: 1.1.0
20742080
"""
20752081
from pandas.io.feather_format import to_feather
20762082

2077-
to_feather(self, path)
2083+
to_feather(self, path, **kwargs)
20782084

20792085
@Appender(
20802086
"""

pandas/io/feather_format.py

+6-3
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,18 @@
77
from pandas.io.common import stringify_path
88

99

10-
def to_feather(df: DataFrame, path):
10+
def to_feather(df: DataFrame, path, **kwargs):
1111
"""
12-
Write a DataFrame to the feather-format
12+
Write a DataFrame to the binary Feather format.
1313
1414
Parameters
1515
----------
1616
df : DataFrame
1717
path : string file path, or file-like object
18+
**kwargs :
19+
Additional keywords passed to `pyarrow.feather.write_feather`.
1820
21+
.. versionadded:: 1.1.0
1922
"""
2023
import_optional_dependency("pyarrow")
2124
from pyarrow import feather
@@ -58,7 +61,7 @@ def to_feather(df: DataFrame, path):
5861
if df.columns.inferred_type not in valid_types:
5962
raise ValueError("feather must have string column names")
6063

61-
feather.write_feather(df, path)
64+
feather.write_feather(df, path, **kwargs)
6265

6366

6467
def read_feather(path, columns=None, use_threads: bool = True):

pandas/tests/io/test_feather.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
import numpy as np
55
import pytest
66

7+
import pandas.util._test_decorators as td
8+
79
import pandas as pd
810
import pandas._testing as tm
911

@@ -27,15 +29,15 @@ def check_error_on_write(self, df, exc):
2729
with tm.ensure_clean() as path:
2830
to_feather(df, path)
2931

30-
def check_round_trip(self, df, expected=None, **kwargs):
32+
def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs):
3133

3234
if expected is None:
3335
expected = df
3436

3537
with tm.ensure_clean() as path:
36-
to_feather(df, path)
38+
to_feather(df, path, **write_kwargs)
3739

38-
result = read_feather(path, **kwargs)
40+
result = read_feather(path, **read_kwargs)
3941
tm.assert_frame_equal(result, expected)
4042

4143
def test_error(self):
@@ -71,6 +73,10 @@ def test_basic(self):
7173
"dtns": pd.date_range("20130101", periods=3, freq="ns"),
7274
}
7375
)
76+
if pyarrow_version >= LooseVersion("0.16.1.dev"):
77+
df["periods"] = pd.period_range("2013", freq="M", periods=3)
78+
df["timedeltas"] = pd.timedelta_range("1 day", periods=3)
79+
df["intervals"] = pd.interval_range(0, 3, 3)
7480

7581
assert df.dttz.dtype.tz.zone == "US/Eastern"
7682
self.check_round_trip(df)
@@ -102,8 +108,8 @@ def test_read_columns(self):
102108

103109
def test_unsupported_other(self):
104110

105-
# period
106-
df = pd.DataFrame({"a": pd.period_range("2013", freq="M", periods=3)})
111+
# mixed python objects
112+
df = pd.DataFrame({"a": ["a", 1, 2.0]})
107113
# Some versions raise ValueError, others raise ArrowInvalid.
108114
self.check_error_on_write(df, Exception)
109115

@@ -148,3 +154,8 @@ def test_path_localpath(self):
148154
df = tm.makeDataFrame().reset_index()
149155
result = tm.round_trip_localpath(df.to_feather, pd.read_feather)
150156
tm.assert_frame_equal(df, result)
157+
158+
@td.skip_if_no("pyarrow", min_version="0.16.1.dev")
159+
def test_passthrough_keywords(self):
160+
df = tm.makeDataFrame().reset_index()
161+
self.check_round_trip(df, write_kwargs=dict(version=1))

0 commit comments

Comments
 (0)