ENH: feather support in the pandas IO api

jreback · jreback · commit 8b31e5407bec · 2016-10-26T06:30:57.000-04:00
closes #13092
diff --git a/appveyor.yml b/appveyor.yml
@@ -81,6 +81,7 @@ install:
 
   # add the pandas channel *before* defaults to have defaults take priority
   - cmd: conda config --add channels pandas
+  - cmd: conda config --add channels conda-forge
   - cmd: conda config --remove channels defaults
   - cmd: conda config --add channels defaults
   - cmd: conda install anaconda-client
diff --git a/ci/install_travis.sh b/ci/install_travis.sh
@@ -74,8 +74,11 @@ else
     conda config --set always_yes true --set changeps1 false || exit 1
     conda update -q conda
 
-    # add the pandas channel *before* defaults to have defaults take priority
+    # add the pandas channel to take priority
+    # add the conda-forge channel *before* defaults
+    # to add extra packages
     echo "add channels"
+    conda config --add channels conda-forge || exit 1
     conda config --add channels pandas || exit 1
     conda config --remove channels defaults || exit 1
     conda config --add channels defaults || exit 1
diff --git a/ci/requirements-2.7-64.run b/ci/requirements-2.7-64.run
@@ -9,6 +9,7 @@ openpyxl
 xlrd
 sqlalchemy
 lxml=3.2.1
+feather-format
 scipy
 xlsxwriter
 boto
diff --git a/ci/requirements-2.7.run b/ci/requirements-2.7.run
@@ -9,6 +9,7 @@ openpyxl=1.6.2
 xlrd=0.9.2
 sqlalchemy=0.9.6
 lxml=3.2.1
+feather-format
 scipy
 xlsxwriter=0.4.6
 boto=2.36.0
diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run
@@ -9,6 +9,7 @@ scipy
 numexpr
 pytables
 html5lib
+feather-format
 lxml
 matplotlib
 jinja2
diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run
@@ -5,6 +5,7 @@ xlsxwriter
 xlrd
 xlwt
 numexpr
+feather-format
 pytables
 html5lib
 lxml
diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -82,6 +82,14 @@ HDFStore: PyTables (HDF5)
    HDFStore.get
    HDFStore.select
 
+Feather
+~~~~~~~
+
+.. autosummary::
+   :toctree: generated/
+
+   read_feather
+
 SAS
 ~~~
 
diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -247,6 +247,7 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
+* `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
     - `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -34,6 +34,7 @@ object.
     * :ref:`read_csv<io.read_csv_table>`
     * :ref:`read_excel<io.excel_reader>`
     * :ref:`read_hdf<io.hdf5>`
+    * :ref:`read_feather<io.feather>`
     * :ref:`read_sql<io.sql>`
     * :ref:`read_json<io.json_reader>`
     * :ref:`read_msgpack<io.msgpack>` (experimental)
@@ -49,6 +50,7 @@ The corresponding ``writer`` functions are object methods that are accessed like
     * :ref:`to_csv<io.store_in_csv>`
     * :ref:`to_excel<io.excel_writer>`
     * :ref:`to_hdf<io.hdf5>`
+    * :ref:`to_feather<io.feather>`
     * :ref:`to_sql<io.sql>`
     * :ref:`to_json<io.json_writer>`
     * :ref:`to_msgpack<io.msgpack>` (experimental)
@@ -4089,6 +4091,63 @@ object). This cannot be changed after table creation.
    os.remove('store.h5')
 
 
+.. _io.feather:
+
+Feather
+-------
+
+.. versionadded:: 0.19.1
+
+Feather provides binary columnar serialization for data frames. It is designed to make reading and writing data
+frames efficient, and to make sharing data across data analysis languages easy.
+
+Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas
+dtypes, including extension dtypes such as categorical and datetime with tz.
+
+Several caveats.
+
+- This is a newer library, and the format, though stable, is not guaranteed to be backward compatible
+  with earlier versions.
+- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an
+  error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index.
+- Unsupported types include ``Period`` and actual python object types. These will raise a helpful error message
+  on an attempt at serialization.
+
+See the `Full Documentation <https://github.com/wesm/feather>`__
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': list('abc'),
+                      'b': list(range(1, 4)),
+                      'c': np.arange(3, 6).astype('u1'),
+                      'd': np.arange(4.0, 7.0, dtype='float64'),
+                      'e': [True, False, True],
+                      'f': pd.Categorical(list('abc')),
+                      'g': pd.date_range('20130101', periods=3),
+                      'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                      'i': pd.date_range('20130101', periods=3, freq='ns')})
+
+   df
+   df.dtypes
+
+Write to a feather file.
+
+.. ipython:: python
+
+   df.to_feather('example.fth')
+
+Read from a feather file.
+
+.. ipython:: python
+
+   pd.read_feather('example.fth')
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('example.fth')
+
 .. _io.sql:
 
 SQL Queries
diff --git a/doc/source/whatsnew/v0.19.1.txt b/doc/source/whatsnew/v0.19.1.txt
@@ -15,6 +15,15 @@ Highlights include:
     :backlinks: none
 
 
+.. _whatsnew_0191.new_features:
+
+New features
+~~~~~~~~~~~~
+
+- Integration with the ``feather-format``, including a new top-level ``pd.read_feather()`` and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
+
+
+
 .. _whatsnew_0191.performance:
 
 Performance Improvements
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
@@ -93,7 +93,7 @@ class TestPDApi(Base, tm.TestCase):
                   'read_gbq', 'read_hdf', 'read_html', 'read_json',
                   'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
                   'read_sql_query', 'read_sql_table', 'read_stata',
-                  'read_table']
+                  'read_table', 'read_feather']
 
     # top-level to_* funcs
     funcs_to = ['to_datetime', 'to_msgpack',
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -1530,6 +1530,21 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
                              variable_labels=variable_labels)
         writer.write_file()
 
+    def to_feather(self, fname):
+        """
+        Write this DataFrame to a file in the binary feather format.
+
+        .. versionadded:: 0.19.1
+
+        Parameters
+        ----------
+        fname : str
+            Path of the file to write.
+
+        """
+        from pandas.io.feather_format import to_feather
+        to_feather(self, fname)
+
     @Appender(fmt.docstring_to_string, indents=1)
     def to_string(self, buf=None, columns=None, col_space=None, header=True,
                   index=True, na_rep='NaN', formatters=None, float_format=None,
diff --git a/pandas/io/api.py b/pandas/io/api.py
@@ -12,6 +12,7 @@
 from pandas.io.html import read_html
 from pandas.io.sql import read_sql, read_sql_table, read_sql_query
 from pandas.io.sas.sasreader import read_sas
+from pandas.io.feather_format import read_feather
 from pandas.io.stata import read_stata
 from pandas.io.pickle import read_pickle, to_pickle
 from pandas.io.packers import read_msgpack, to_msgpack
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -0,0 +1,110 @@
+""" feather-format compat """
+
+from pandas import DataFrame, RangeIndex, MultiIndex, Int64Index
+from pandas.types.common import is_object_dtype
+from pandas.compat import range
+from pandas.lib import infer_dtype
+
+
+def _try_import():
+    """
+    Import the optional ``feather`` module and return it.
+
+    Raises
+    ------
+    ImportError
+        If ``feather`` cannot be imported; the message explains how to
+        install it with conda.
+    """
+    # feather depends on pandas, so the import is deferred until first use
+    # instead of being done when this module is loaded
+
+    try:
+        import feather
+    except ImportError:
+
+        # re-raise with installation instructions for the user
+        raise ImportError("the feather-format library is not installed\n"
+                          "you can install via conda\n"
+                          "conda install feather-format -c conda-forge")
+    return feather
+
+
+def to_feather(df, path):
+    """
+    Write a DataFrame to the feather-format
+
+    The frame is validated before writing: the index is not serialized,
+    column labels must be unique strings, and object columns holding
+    anything other than strings would not de-serialize properly.
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : string
+        File path
+
+    Raises
+    ------
+    ValueError
+        If ``df`` is not a DataFrame, has a non-default index, has
+        duplicate, MultiIndex or non-string column labels, or has object
+        columns whose inferred type is not string/unicode.
+    ImportError
+        If the feather-format library is not installed.
+    """
+    if not isinstance(df, DataFrame):
+        raise ValueError("feather only supports IO with DataFrames")
+
+    feather = _try_import()
+    valid_types = {'string', 'unicode'}
+
+    # validate index
+    # --------------
+
+    # validate that we have only a default index
+    # raise on anything else as we don't serialize the index
+
+    if not isinstance(df.index, (RangeIndex, Int64Index)):
+        raise ValueError("feather does not support serializing {} "
+                         "for the index; you can .reset_index() "
+                         "to make the index into column(s)".format(
+                             type(df.index)))
+
+    # an integer index is only acceptable when it is exactly 0..len(df)-1
+    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
+        raise ValueError("feather does not support serializing a "
+                         "non-default index for the index; you can "
+                         ".reset_index() to make the index into column(s)")
+
+    # validate columns
+    # ----------------
+
+    # must have unique column names
+    if not df.columns.is_unique:
+        raise ValueError("feather does not support duplicate columns")
+
+    # must be a flat Index, not a MultiIndex
+    if isinstance(df.columns, MultiIndex):
+        raise ValueError("feather does not support serializing a "
+                         "MultiIndex for the columns")
+
+    # must have valid column names (strings only)
+    if df.columns.inferred_type not in valid_types:
+        raise ValueError("feather must have string column names")
+
+    # validate dtypes
+    # ---------------
+
+    # validate that we do not have any non-string object dtypes
+    # as these 'work', but will not properly de-serialize
+    objects = [c for c, dtype in df.dtypes.iteritems()
+               if is_object_dtype(dtype)]
+    dtypes = [infer_dtype(df[c]) for c in objects]
+    if len(set(dtypes) - valid_types):
+        # build a small report frame listing only the offending columns
+        invalid = DataFrame([[i, c, dtype] for i, (c, dtype) in
+                             enumerate(zip(objects, dtypes))])
+        invalid.columns = ['ncolumn', 'column', 'inferred_dtype']
+        invalid = invalid[~invalid.inferred_dtype.isin(list(valid_types))]
+
+        msg = ("The following columns are not supported to serialize "
+               "to the feather-format:\n\n"
+               "{}".format(invalid.to_string()))
+        raise ValueError(msg)
+
+    feather.write_dataframe(df, path)
+
+
+def read_feather(path):
+    """
+    Load a feather-format object from the file path
+
+    Parameters
+    ----------
+    path : string
+        File path
+
+    Returns
+    -------
+    type of object stored in file
+        The result of ``feather.read_dataframe(path)``, returned unchanged
+        (presumably a DataFrame -- confirm against the feather docs).
+
+    Raises
+    ------
+    ImportError
+        If the feather-format library is not installed.
+    """
+
+    # feather is an optional dependency, so it is resolved on first use
+    feather = _try_import()
+    return feather.read_dataframe(path)
diff --git a/pandas/io/tests/test_feather.py b/pandas/io/tests/test_feather.py
diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py