From 3ede160aa8f07b9a592ca52ab94c5da3fe8055e4 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 9 Oct 2016 20:05:49 -0400 Subject: [PATCH] ENH: feather support in the pandas IO api closes #13092 --- appveyor.yml | 1 + ci/install_travis.sh | 3 +- ci/requirements-2.7-64.run | 2 +- ci/requirements-2.7.sh | 7 ++ ci/requirements-3.5-64.run | 3 +- ci/requirements-3.5.run | 4 +- ci/requirements-3.5.sh | 7 ++ ci/requirements-3.5_OSX.run | 4 +- ci/requirements-3.5_OSX.sh | 7 ++ doc/source/api.rst | 9 +++ doc/source/install.rst | 1 + doc/source/io.rst | 64 +++++++++++++++++ doc/source/whatsnew/v0.20.0.txt | 3 + pandas/api/tests/test_api.py | 2 +- pandas/core/frame.py | 15 ++++ pandas/io/api.py | 1 + pandas/io/feather_format.py | 101 ++++++++++++++++++++++++++ pandas/io/tests/test_feather.py | 123 ++++++++++++++++++++++++++++++++ pandas/util/print_versions.py | 1 + 19 files changed, 348 insertions(+), 10 deletions(-) create mode 100644 ci/requirements-2.7.sh create mode 100644 ci/requirements-3.5.sh create mode 100644 ci/requirements-3.5_OSX.sh create mode 100644 pandas/io/feather_format.py create mode 100644 pandas/io/tests/test_feather.py diff --git a/appveyor.yml b/appveyor.yml index 84c34b34626b9..a8e5218ab2c9f 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -80,6 +80,7 @@ install: - cmd: conda config --set ssl_verify false # add the pandas channel *before* defaults to have defaults take priority + - cmd: conda config --add channels conda-forge - cmd: conda config --add channels pandas - cmd: conda config --remove channels defaults - cmd: conda config --add channels defaults diff --git a/ci/install_travis.sh b/ci/install_travis.sh index f35e216550a2d..542d22d9fa871 100755 --- a/ci/install_travis.sh +++ b/ci/install_travis.sh @@ -71,7 +71,8 @@ else conda config --set always_yes true --set changeps1 false || exit 1 conda update -q conda - # add the pandas channel *before* defaults to have defaults take priority + # add the pandas channel to take priority + # to add 
extra packages echo "add channels" conda config --add channels pandas || exit 1 conda config --remove channels defaults || exit 1 diff --git a/ci/requirements-2.7-64.run b/ci/requirements-2.7-64.run index 94472dafd565d..f953682f52d45 100644 --- a/ci/requirements-2.7-64.run +++ b/ci/requirements-2.7-64.run @@ -3,7 +3,7 @@ pytz numpy=1.10* xlwt numexpr -pytables +pytables==3.2.2 matplotlib openpyxl xlrd diff --git a/ci/requirements-2.7.sh b/ci/requirements-2.7.sh new file mode 100644 index 0000000000000..64d470e5c6e0e --- /dev/null +++ b/ci/requirements-2.7.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "install 27" + +conda install -n pandas -c conda-forge feather-format diff --git a/ci/requirements-3.5-64.run b/ci/requirements-3.5-64.run index 96de21e3daa5e..905c2ff3625bd 100644 --- a/ci/requirements-3.5-64.run +++ b/ci/requirements-3.5-64.run @@ -1,11 +1,12 @@ python-dateutil pytz -numpy=1.10* +numpy openpyxl xlsxwriter xlrd xlwt scipy +feather-format numexpr pytables matplotlib diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run index 1d1cb38fd57a6..e15ca6079b4fe 100644 --- a/ci/requirements-3.5.run +++ b/ci/requirements-3.5.run @@ -18,6 +18,4 @@ pymysql psycopg2 xarray s3fs - -# incompat with conda ATM -# beautiful-soup +beautifulsoup4 diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh new file mode 100644 index 0000000000000..d0f0b81802dc6 --- /dev/null +++ b/ci/requirements-3.5.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "install 35" + +conda install -n pandas -c conda-forge feather-format diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run index eceb2f9cdcebc..1d83474d10f2f 100644 --- a/ci/requirements-3.5_OSX.run +++ b/ci/requirements-3.5_OSX.run @@ -13,6 +13,4 @@ jinja2 bottleneck xarray s3fs - -# incompat with conda ATM -# beautiful-soup +beautifulsoup4 diff --git a/ci/requirements-3.5_OSX.sh b/ci/requirements-3.5_OSX.sh new file mode 100644 index 0000000000000..cfbd2882a8a2d 
--- /dev/null +++ b/ci/requirements-3.5_OSX.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +source activate pandas + +echo "install 35_OSX" + +conda install -n pandas -c conda-forge feather-format diff --git a/doc/source/api.rst b/doc/source/api.rst index b8157929bd940..272dfe72eafe7 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -83,6 +83,14 @@ HDFStore: PyTables (HDF5) HDFStore.get HDFStore.select +Feather +~~~~~~~ + +.. autosummary:: + :toctree: generated/ + + read_feather + SAS ~~~ @@ -1015,6 +1023,7 @@ Serialization / IO / Conversion DataFrame.to_excel DataFrame.to_json DataFrame.to_html + DataFrame.to_feather DataFrame.to_latex DataFrame.to_stata DataFrame.to_msgpack diff --git a/doc/source/install.rst b/doc/source/install.rst index f62342fa52e5c..4787b3356ee9f 100644 --- a/doc/source/install.rst +++ b/doc/source/install.rst @@ -247,6 +247,7 @@ Optional Dependencies * `SciPy `__: miscellaneous statistical functions * `xarray `__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended. * `PyTables `__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended. +* `Feather Format `__: necessary for feather-based storage, version 0.3.1 or higher. * `SQLAlchemy `__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs `__. Some common drivers are: - `psycopg2 `__: for PostgreSQL diff --git a/doc/source/io.rst b/doc/source/io.rst index 9d51d2599d668..259f9605d8313 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -34,6 +34,7 @@ object. 
* :ref:`read_csv` * :ref:`read_excel` * :ref:`read_hdf` + * :ref:`read_feather` * :ref:`read_sql` * :ref:`read_json` * :ref:`read_msgpack` (experimental) @@ -49,6 +50,7 @@ The corresponding ``writer`` functions are object methods that are accessed like * :ref:`to_csv` * :ref:`to_excel` * :ref:`to_hdf` + * :ref:`to_feather` * :ref:`to_sql` * :ref:`to_json` * :ref:`to_msgpack` (experimental) @@ -4152,6 +4154,68 @@ object). This cannot be changed after table creation. os.remove('store.h5') +.. _io.feather: + +Feather +------- + +.. versionadded:: 0.20.0 + +Feather provides binary columnar serialization for data frames. It is designed to make reading and writing data +frames efficient, and to make sharing data across data analysis languages easy. + +Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas +dtypes, including extension dtypes such as categorical and datetime with tz. + +Several caveats. + +- This is a newer library, and the format, though stable, is not guaranteed to be backward compatible + to the earlier versions. +- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an + error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index. +- Duplicate column names and non-string column names are not supported. +- Unsupported types include ``Period`` and actual python object types. These will raise a helpful error message + on an attempt at serialization. + +See the `Full Documentation <https://github.com/wesm/feather>`__ + +.. ipython:: python + + df = pd.DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, freq='ns')}) + + df + df.dtypes + +Write to a feather file. 
+ +.. ipython:: python + + df.to_feather('example.fth') + +Read from a feather file. + +.. ipython:: python + + result = pd.read_feather('example.fth') + result + + # we preserve dtypes + result.dtypes + +.. ipython:: python + :suppress: + + import os + os.remove('example.fth') + .. _io.sql: SQL Queries diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 40bd8bc4154a6..0873e4b34b0b1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -22,6 +22,9 @@ Check the :ref:`API Changes ` and :ref:`deprecations New features ~~~~~~~~~~~~ +- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`. + + .. _whatsnew_0200.enhancements.dataio_dtype: diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py index bc126447213ca..b13b4d7de60ca 100644 --- a/pandas/api/tests/test_api.py +++ b/pandas/api/tests/test_api.py @@ -95,7 +95,7 @@ class TestPDApi(Base, tm.TestCase): 'read_gbq', 'read_hdf', 'read_html', 'read_json', 'read_msgpack', 'read_pickle', 'read_sas', 'read_sql', 'read_sql_query', 'read_sql_table', 'read_stata', - 'read_table'] + 'read_table', 'read_feather'] # top-level to_* funcs funcs_to = ['to_datetime', 'to_msgpack', diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ba1e08ecc482f..d12b8af35469b 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1477,6 +1477,21 @@ def to_stata(self, fname, convert_dates=None, write_index=True, variable_labels=variable_labels) writer.write_file() + def to_feather(self, fname): + """ + write out the binary feather-format for DataFrames + + .. 
versionadded:: 0.20.0 + + Parameters + ---------- + fname : str + string file path + + """ + from pandas.io.feather_format import to_feather + to_feather(self, fname) + @Appender(fmt.docstring_to_string, indents=1) def to_string(self, buf=None, columns=None, col_space=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, diff --git a/pandas/io/api.py b/pandas/io/api.py index 920ece9c4c3a8..0bd86c85b4b8b 100644 --- a/pandas/io/api.py +++ b/pandas/io/api.py @@ -12,6 +12,7 @@ from pandas.io.html import read_html from pandas.io.sql import read_sql, read_sql_table, read_sql_query from pandas.io.sas.sasreader import read_sas +from pandas.io.feather_format import read_feather from pandas.io.stata import read_stata from pandas.io.pickle import read_pickle, to_pickle from pandas.io.packers import read_msgpack, to_msgpack diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py new file mode 100644 index 0000000000000..ac74ac4823613 --- /dev/null +++ b/pandas/io/feather_format.py @@ -0,0 +1,101 @@ +""" feather-format compat """ + +from distutils.version import LooseVersion +from pandas import DataFrame, RangeIndex, Int64Index +from pandas.compat import range + + +def _try_import(): + # since pandas is a dependency of feather + # we need to import on first use + + try: + import feather + except ImportError: + + # give a nice error message + raise ImportError("the feather-format library is not installed\n" + "you can install via conda\n" + "conda install feather-format -c conda-forge\n" + "or via pip\n" + "pip install feather-format\n") + + try: + feather.__version__ >= LooseVersion('0.3.1') + except AttributeError: + raise ImportError("the feather-format library must be >= " + "version 0.3.1\n" + "you can install via conda\n" + "conda install feather-format -c conda-forge" + "or via pip\n" + "pip install feather-format\n") + + return feather + + +def to_feather(df, path): + """ + Write a DataFrame to the feather-format + + Parameters + 
---------- + df : DataFrame + path : string + File path + """ + if not isinstance(df, DataFrame): + raise ValueError("feather only support IO with DataFrames") + + feather = _try_import() + valid_types = {'string', 'unicode'} + + # validate index + # -------------- + + # validate that we have only a default index + # raise on anything else as we don't serialize the index + + if not isinstance(df.index, Int64Index): + raise ValueError("feather does not serializing {} " + "for the index; you can .reset_index()" + "to make the index into column(s)".format( + type(df.index))) + + if not df.index.equals(RangeIndex.from_range(range(len(df)))): + raise ValueError("feather does not serializing a non-default index " + "for the index; you can .reset_index()" + "to make the index into column(s)") + + if df.index.name is not None: + raise ValueError("feather does not serialize index meta-data on a " + "default index") + + # validate columns + # ---------------- + + # must have valid column names (strings only) + if df.columns.inferred_type not in valid_types: + raise ValueError("feather must have string column names") + + feather.write_dataframe(df, path) + + +def read_feather(path): + """ + Load a feather-format object from the file path + + .. 
versionadded:: 0.20.0 + + Parameters + ---------- + path : string + File path + + Returns + ------- + type of object stored in file + + """ + + feather = _try_import() + return feather.read_dataframe(path) diff --git a/pandas/io/tests/test_feather.py b/pandas/io/tests/test_feather.py new file mode 100644 index 0000000000000..b8b85d7dbbece --- /dev/null +++ b/pandas/io/tests/test_feather.py @@ -0,0 +1,123 @@ +""" test feather-format compat """ + +import nose + +import numpy as np +import pandas as pd + +from pandas.io.feather_format import to_feather, read_feather + +try: + import feather # noqa +except ImportError: + raise nose.SkipTest('no feather-format installed') + +from feather import FeatherError +import pandas.util.testing as tm +from pandas.util.testing import assert_frame_equal, ensure_clean + + +class TestFeather(tm.TestCase): + _multiprocess_can_split_ = True + + def setUp(self): + pass + + def check_error_on_write(self, df, exc): + # check that we are raising the exception + # on writing + + def f(): + with ensure_clean() as path: + to_feather(df, path) + self.assertRaises(exc, f) + + def check_round_trip(self, df): + + with ensure_clean() as path: + to_feather(df, path) + result = read_feather(path) + assert_frame_equal(result, df) + + def test_error(self): + + for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'), + np.array([1, 2, 3])]: + self.check_error_on_write(obj, ValueError) + + def test_basic(self): + + df = pd.DataFrame({'a': list('abc'), + 'b': list(range(1, 4)), + 'c': np.arange(3, 6).astype('u1'), + 'd': np.arange(4.0, 7.0, dtype='float64'), + 'e': [True, False, True], + 'f': pd.Categorical(list('abc')), + 'g': pd.date_range('20130101', periods=3), + 'h': pd.date_range('20130101', periods=3, + tz='US/Eastern'), + 'i': pd.date_range('20130101', periods=3, + freq='ns')}) + + self.check_round_trip(df) + + def test_strided_data_issues(self): + + # strided data issue https://github.com/wesm/feather/issues/97 + df = 
pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('abc')) + self.check_error_on_write(df, FeatherError) + + def test_duplicate_columns(self): + + # https://github.com/wesm/feather/issues/53 + # not currently able to handle duplicate columns + df = pd.DataFrame(np.arange(12).reshape(4, 3), + columns=list('aaa')).copy() + self.check_error_on_write(df, ValueError) + + def test_stringify_columns(self): + + df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy() + self.check_error_on_write(df, ValueError) + + def test_unsupported(self): + + # period + df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)}) + self.check_error_on_write(df, ValueError) + + # non-strings + df = pd.DataFrame({'a': ['a', 1, 2.0]}) + self.check_error_on_write(df, ValueError) + + def test_write_with_index(self): + + df = pd.DataFrame({'A': [1, 2, 3]}) + self.check_round_trip(df) + + # non-default index + for index in [[2, 3, 4], + pd.date_range('20130101', periods=3), + list('abc'), + [1, 3, 4], + pd.MultiIndex.from_tuples([('a', 1), ('a', 2), + ('b', 1)]), + ]: + + df.index = index + self.check_error_on_write(df, ValueError) + + # index with meta-data + df.index = [0, 1, 2] + df.index.name = 'foo' + self.check_error_on_write(df, ValueError) + + # column multi-index + df.index = [0, 1, 2] + df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]), + self.check_error_on_write(df, ValueError) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py index 657681d4c33ce..fac76be676398 100644 --- a/pandas/util/print_versions.py +++ b/pandas/util/print_versions.py @@ -80,6 +80,7 @@ def show_versions(as_json=False): ("bottleneck", lambda mod: mod.__version__), ("tables", lambda mod: mod.__version__), ("numexpr", lambda mod: mod.__version__), + ("feather", lambda mod: mod.version.version), ("matplotlib", lambda mod: mod.__version__), 
("openpyxl", lambda mod: mod.__version__), ("xlrd", lambda mod: mod.__VERSION__),