From 3ede160aa8f07b9a592ca52ab94c5da3fe8055e4 Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Sun, 9 Oct 2016 20:05:49 -0400
Subject: [PATCH] ENH: feather support in the pandas IO api

closes #13092
---
 appveyor.yml                    |   1 +
 ci/install_travis.sh            |   3 +-
 ci/requirements-2.7-64.run      |   2 +-
 ci/requirements-2.7.sh          |   7 ++
 ci/requirements-3.5-64.run      |   3 +-
 ci/requirements-3.5.run         |   4 +-
 ci/requirements-3.5.sh          |   7 ++
 ci/requirements-3.5_OSX.run     |   4 +-
 ci/requirements-3.5_OSX.sh      |   7 ++
 doc/source/api.rst              |   9 +++
 doc/source/install.rst          |   1 +
 doc/source/io.rst               |  64 +++++++++++++++++
 doc/source/whatsnew/v0.20.0.txt |   3 +
 pandas/api/tests/test_api.py    |   2 +-
 pandas/core/frame.py            |  15 ++++
 pandas/io/api.py                |   1 +
 pandas/io/feather_format.py     | 101 ++++++++++++++++++++++++++
 pandas/io/tests/test_feather.py | 123 ++++++++++++++++++++++++++++++++
 pandas/util/print_versions.py   |   1 +
 19 files changed, 348 insertions(+), 10 deletions(-)
 create mode 100644 ci/requirements-2.7.sh
 create mode 100644 ci/requirements-3.5.sh
 create mode 100644 ci/requirements-3.5_OSX.sh
 create mode 100644 pandas/io/feather_format.py
 create mode 100644 pandas/io/tests/test_feather.py

diff --git a/appveyor.yml b/appveyor.yml
index 84c34b34626b9..a8e5218ab2c9f 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -80,6 +80,7 @@ install:
   - cmd: conda config --set ssl_verify false
 
   # add the pandas channel *before* defaults to have defaults take priority
+  - cmd: conda config --add channels conda-forge
   - cmd: conda config --add channels pandas
   - cmd: conda config --remove channels defaults
   - cmd: conda config --add channels defaults
diff --git a/ci/install_travis.sh b/ci/install_travis.sh
index f35e216550a2d..542d22d9fa871 100755
--- a/ci/install_travis.sh
+++ b/ci/install_travis.sh
@@ -71,7 +71,8 @@ else
     conda config --set always_yes true --set changeps1 false || exit 1
     conda update -q conda
 
-    # add the pandas channel *before* defaults to have defaults take priority
+    # add the pandas channel to take priority
+    # to add extra packages
     echo "add channels"
     conda config --add channels pandas || exit 1
     conda config --remove channels defaults || exit 1
diff --git a/ci/requirements-2.7-64.run b/ci/requirements-2.7-64.run
index 94472dafd565d..f953682f52d45 100644
--- a/ci/requirements-2.7-64.run
+++ b/ci/requirements-2.7-64.run
@@ -3,7 +3,7 @@ pytz
 numpy=1.10*
 xlwt
 numexpr
-pytables
+pytables==3.2.2
 matplotlib
 openpyxl
 xlrd
diff --git a/ci/requirements-2.7.sh b/ci/requirements-2.7.sh
new file mode 100644
index 0000000000000..64d470e5c6e0e
--- /dev/null
+++ b/ci/requirements-2.7.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source activate pandas
+
+echo "install 27"
+
+conda install -n pandas -c conda-forge feather-format
diff --git a/ci/requirements-3.5-64.run b/ci/requirements-3.5-64.run
index 96de21e3daa5e..905c2ff3625bd 100644
--- a/ci/requirements-3.5-64.run
+++ b/ci/requirements-3.5-64.run
@@ -1,11 +1,12 @@
 python-dateutil
 pytz
-numpy=1.10*
+numpy
 openpyxl
 xlsxwriter
 xlrd
 xlwt
 scipy
+feather-format
 numexpr
 pytables
 matplotlib
diff --git a/ci/requirements-3.5.run b/ci/requirements-3.5.run
index 1d1cb38fd57a6..e15ca6079b4fe 100644
--- a/ci/requirements-3.5.run
+++ b/ci/requirements-3.5.run
@@ -18,6 +18,4 @@ pymysql
 psycopg2
 xarray
 s3fs
-
-# incompat with conda ATM
-# beautiful-soup
+beautifulsoup4
diff --git a/ci/requirements-3.5.sh b/ci/requirements-3.5.sh
new file mode 100644
index 0000000000000..d0f0b81802dc6
--- /dev/null
+++ b/ci/requirements-3.5.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source activate pandas
+
+echo "install 35"
+
+conda install -n pandas -c conda-forge feather-format
diff --git a/ci/requirements-3.5_OSX.run b/ci/requirements-3.5_OSX.run
index eceb2f9cdcebc..1d83474d10f2f 100644
--- a/ci/requirements-3.5_OSX.run
+++ b/ci/requirements-3.5_OSX.run
@@ -13,6 +13,4 @@ jinja2
 bottleneck
 xarray
 s3fs
-
-# incompat with conda ATM
-# beautiful-soup
+beautifulsoup4
diff --git a/ci/requirements-3.5_OSX.sh b/ci/requirements-3.5_OSX.sh
new file mode 100644
index 0000000000000..cfbd2882a8a2d
--- /dev/null
+++ b/ci/requirements-3.5_OSX.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+source activate pandas
+
+echo "install 35_OSX"
+
+conda install -n pandas -c conda-forge feather-format
diff --git a/doc/source/api.rst b/doc/source/api.rst
index b8157929bd940..272dfe72eafe7 100644
--- a/doc/source/api.rst
+++ b/doc/source/api.rst
@@ -83,6 +83,14 @@ HDFStore: PyTables (HDF5)
    HDFStore.get
    HDFStore.select
 
+Feather
+~~~~~~~
+
+.. autosummary::
+   :toctree: generated/
+
+   read_feather
+
 SAS
 ~~~
 
@@ -1015,6 +1023,7 @@ Serialization / IO / Conversion
    DataFrame.to_excel
    DataFrame.to_json
    DataFrame.to_html
+   DataFrame.to_feather
    DataFrame.to_latex
    DataFrame.to_stata
    DataFrame.to_msgpack
diff --git a/doc/source/install.rst b/doc/source/install.rst
index f62342fa52e5c..4787b3356ee9f 100644
--- a/doc/source/install.rst
+++ b/doc/source/install.rst
@@ -247,6 +247,7 @@ Optional Dependencies
 * `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
+* `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
     - `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 9d51d2599d668..259f9605d8313 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -34,6 +34,7 @@ object.
     * :ref:`read_csv<io.read_csv_table>`
     * :ref:`read_excel<io.excel_reader>`
     * :ref:`read_hdf<io.hdf5>`
+    * :ref:`read_feather<io.feather>`
     * :ref:`read_sql<io.sql>`
     * :ref:`read_json<io.json_reader>`
     * :ref:`read_msgpack<io.msgpack>` (experimental)
@@ -49,6 +50,7 @@ The corresponding ``writer`` functions are object methods that are accessed like
     * :ref:`to_csv<io.store_in_csv>`
     * :ref:`to_excel<io.excel_writer>`
     * :ref:`to_hdf<io.hdf5>`
+    * :ref:`to_feather<io.feather>`
     * :ref:`to_sql<io.sql>`
     * :ref:`to_json<io.json_writer>`
     * :ref:`to_msgpack<io.msgpack>` (experimental)
@@ -4152,6 +4154,68 @@ object). This cannot be changed after table creation.
    os.remove('store.h5')
 
 
+.. _io.feather:
+
+Feather
+-------
+
+.. versionadded:: 0.20.0
+
+Feather provides binary columnar serialization for data frames. It is designed to make reading and writing data
+frames efficient, and to make sharing data across data analysis languages easy.
+
+Feather is designed to faithfully serialize and de-serialize DataFrames, supporting all of the pandas
+dtypes, including extension dtypes such as categorical and datetime with tz.
+
+Several caveats.
+
+- This is a newer library, and the format, though stable, is not guaranteed to be backward compatible
+  with earlier versions.
+- The format will NOT write an ``Index``, or ``MultiIndex`` for the ``DataFrame`` and will raise an
+  error if a non-default one is provided. You can simply ``.reset_index()`` in order to store the index.
+- Duplicate column names and non-string column names are not supported.
+- Unsupported types include ``Period`` and actual Python object types. These will raise a helpful error message
+  on an attempt at serialization.
+
+See the `Full Documentation <https://github.com/wesm/feather>`__
+
+.. ipython:: python
+
+   df = pd.DataFrame({'a': list('abc'),
+                      'b': list(range(1, 4)),
+                      'c': np.arange(3, 6).astype('u1'),
+                      'd': np.arange(4.0, 7.0, dtype='float64'),
+                      'e': [True, False, True],
+                      'f': pd.Categorical(list('abc')),
+                      'g': pd.date_range('20130101', periods=3),
+                      'h': pd.date_range('20130101', periods=3, tz='US/Eastern'),
+                      'i': pd.date_range('20130101', periods=3, freq='ns')})
+
+   df
+   df.dtypes
+
+Write to a feather file.
+
+.. ipython:: python
+
+   df.to_feather('example.fth')
+
+Read from a feather file.
+
+.. ipython:: python
+
+   result = pd.read_feather('example.fth')
+   result
+
+   # we preserve dtypes
+   result.dtypes
+
+.. ipython:: python
+   :suppress:
+
+   import os
+   os.remove('example.fth')
+
 .. _io.sql:
 
 SQL Queries
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 40bd8bc4154a6..0873e4b34b0b1 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -22,6 +22,9 @@ Check the :ref:`API Changes <whatsnew_0200.api_breaking>` and :ref:`deprecations
 New features
 ~~~~~~~~~~~~
 
+- Integration with the ``feather-format`` library, including a new top-level ``pd.read_feather()`` function and ``DataFrame.to_feather()`` method, see :ref:`here <io.feather>`.
+
+
 
 .. _whatsnew_0200.enhancements.dataio_dtype:
 
diff --git a/pandas/api/tests/test_api.py b/pandas/api/tests/test_api.py
index bc126447213ca..b13b4d7de60ca 100644
--- a/pandas/api/tests/test_api.py
+++ b/pandas/api/tests/test_api.py
@@ -95,7 +95,7 @@ class TestPDApi(Base, tm.TestCase):
                   'read_gbq', 'read_hdf', 'read_html', 'read_json',
                   'read_msgpack', 'read_pickle', 'read_sas', 'read_sql',
                   'read_sql_query', 'read_sql_table', 'read_stata',
-                  'read_table']
+                  'read_table', 'read_feather']
 
     # top-level to_* funcs
     funcs_to = ['to_datetime', 'to_msgpack',
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ba1e08ecc482f..d12b8af35469b 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -1477,6 +1477,21 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
                              variable_labels=variable_labels)
         writer.write_file()
 
+    def to_feather(self, fname):
+        """
+        Serialize this DataFrame to a file in the binary feather-format.
+
+        .. versionadded:: 0.20.0
+
+        Parameters
+        ----------
+        fname : str
+            path of the file to write
+
+        """
+        from pandas.io import feather_format
+        feather_format.to_feather(self, fname)
+
     @Appender(fmt.docstring_to_string, indents=1)
     def to_string(self, buf=None, columns=None, col_space=None, header=True,
                   index=True, na_rep='NaN', formatters=None, float_format=None,
diff --git a/pandas/io/api.py b/pandas/io/api.py
index 920ece9c4c3a8..0bd86c85b4b8b 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -12,6 +12,7 @@
 from pandas.io.html import read_html
 from pandas.io.sql import read_sql, read_sql_table, read_sql_query
 from pandas.io.sas.sasreader import read_sas
+from pandas.io.feather_format import read_feather
 from pandas.io.stata import read_stata
 from pandas.io.pickle import read_pickle, to_pickle
 from pandas.io.packers import read_msgpack, to_msgpack
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
new file mode 100644
index 0000000000000..ac74ac4823613
--- /dev/null
+++ b/pandas/io/feather_format.py
@@ -0,0 +1,101 @@
+""" feather-format compat """
+
+from distutils.version import LooseVersion
+from pandas import DataFrame, RangeIndex, Int64Index
+from pandas.compat import range
+
+
+def _try_import():
+    """Return the ``feather`` module, raising ImportError if unusable."""
+    # pandas is a dependency of feather, so import lazily on first use
+    try:
+        import feather
+    except ImportError:
+
+        # give a nice error message
+        raise ImportError("the feather-format library is not installed\n"
+                          "you can install via conda\n"
+                          "conda install feather-format -c conda-forge\n"
+                          "or via pip\n"
+                          "pip install feather-format\n")
+
+    # a missing __version__ is treated as too old; the comparison result
+    # must actually be tested, not merely evaluated
+    version = getattr(feather, '__version__', None)
+    if version is None or LooseVersion(version) < LooseVersion('0.3.1'):
+        raise ImportError("the feather-format library must be >= "
+                          "version 0.3.1\n"
+                          "you can install via conda\n"
+                          "conda install feather-format -c conda-forge\n"
+                          "or via pip\n"
+                          "pip install feather-format\n")
+
+    return feather
+
+
+def to_feather(df, path):
+    """
+    Write a DataFrame to the feather-format
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : string
+        File path
+
+    Raises
+    ------
+    ValueError
+        if df is not a DataFrame, has a non-default or named index,
+        or has non-string column names
+    """
+    if not isinstance(df, DataFrame):
+        raise ValueError("feather only supports IO with DataFrames")
+
+    feather = _try_import()
+    valid_types = {'string', 'unicode'}
+
+    # validate index
+    # only a default (unnamed, 0..n-1) integer index is accepted;
+    # raise on anything else as we don't serialize the index
+    if not isinstance(df.index, Int64Index):
+        raise ValueError("feather does not support serializing {} "
+                         "for the index; you can .reset_index() "
+                         "to make the index into column(s)".format(
+                             type(df.index)))
+
+    if not df.index.equals(RangeIndex.from_range(range(len(df)))):
+        raise ValueError("feather does not support serializing a "
+                         "non-default index; you can .reset_index() "
+                         "to make the index into column(s)")
+
+    if df.index.name is not None:
+        raise ValueError("feather does not serialize index meta-data on a "
+                         "default index")
+
+    # validate columns: must have valid column names (strings only)
+    if df.columns.inferred_type not in valid_types:
+        raise ValueError("feather must have string column names")
+
+    feather.write_dataframe(df, path)
+
+
+def read_feather(path):
+    """
+    Load a feather-format object from the file path
+
+    .. versionadded:: 0.20.0
+
+    Parameters
+    ----------
+    path : string
+        File path
+
+    Returns
+    -------
+    DataFrame, as returned by ``feather.read_dataframe``
+
+    """
+
+    feather = _try_import()
+    return feather.read_dataframe(path)
diff --git a/pandas/io/tests/test_feather.py b/pandas/io/tests/test_feather.py
new file mode 100644
index 0000000000000..b8b85d7dbbece
--- /dev/null
+++ b/pandas/io/tests/test_feather.py
@@ -0,0 +1,123 @@
+""" test feather-format compat """
+
+import nose
+
+import numpy as np
+import pandas as pd
+
+from pandas.io.feather_format import to_feather, read_feather
+
+try:
+    import feather  # noqa
+except ImportError:
+    raise nose.SkipTest('no feather-format installed')
+
+from feather import FeatherError
+import pandas.util.testing as tm
+from pandas.util.testing import assert_frame_equal, ensure_clean
+
+
+class TestFeather(tm.TestCase):
+    _multiprocess_can_split_ = True
+
+    def setUp(self):
+        pass
+
+    def check_error_on_write(self, df, exc):
+        # check that we are raising the exception
+        # on writing
+
+        def f():
+            with ensure_clean() as path:
+                to_feather(df, path)
+        self.assertRaises(exc, f)
+
+    def check_round_trip(self, df):
+        # check that writing and then reading back gives an equal frame
+        with ensure_clean() as path:
+            to_feather(df, path)
+            result = read_feather(path)
+            assert_frame_equal(result, df)
+
+    def test_error(self):
+        # anything that is not a DataFrame is rejected
+        for obj in [pd.Series([1, 2, 3]), 1, 'foo', pd.Timestamp('20130101'),
+                    np.array([1, 2, 3])]:
+            self.check_error_on_write(obj, ValueError)
+
+    def test_basic(self):
+        # a frame covering a range of column dtypes
+        df = pd.DataFrame({'a': list('abc'),
+                           'b': list(range(1, 4)),
+                           'c': np.arange(3, 6).astype('u1'),
+                           'd': np.arange(4.0, 7.0, dtype='float64'),
+                           'e': [True, False, True],
+                           'f': pd.Categorical(list('abc')),
+                           'g': pd.date_range('20130101', periods=3),
+                           'h': pd.date_range('20130101', periods=3,
+                                              tz='US/Eastern'),
+                           'i': pd.date_range('20130101', periods=3,
+                                              freq='ns')})
+
+        self.check_round_trip(df)
+
+    def test_strided_data_issues(self):
+
+        # strided data issue: https://github.com/wesm/feather/issues/97
+        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list('abc'))
+        self.check_error_on_write(df, FeatherError)
+
+    def test_duplicate_columns(self):
+
+        # https://github.com/wesm/feather/issues/53
+        # not currently able to handle duplicate columns
+        df = pd.DataFrame(np.arange(12).reshape(4, 3),
+                          columns=list('aaa')).copy()
+        self.check_error_on_write(df, ValueError)
+
+    def test_stringify_columns(self):
+        # integer (non-string) column names are rejected
+        df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy()
+        self.check_error_on_write(df, ValueError)
+
+    def test_unsupported(self):
+
+        # period
+        df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
+        self.check_error_on_write(df, ValueError)
+
+        # non-strings
+        df = pd.DataFrame({'a': ['a', 1, 2.0]})
+        self.check_error_on_write(df, ValueError)
+
+    def test_write_with_index(self):
+        # a default index round-trips
+        df = pd.DataFrame({'A': [1, 2, 3]})
+        self.check_round_trip(df)
+
+        # non-default index
+        for index in [[2, 3, 4],
+                      pd.date_range('20130101', periods=3),
+                      list('abc'),
+                      [1, 3, 4],
+                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
+                                                 ('b', 1)]),
+                      ]:
+
+            df.index = index
+            self.check_error_on_write(df, ValueError)
+
+        # index with meta-data
+        df.index = [0, 1, 2]
+        df.index.name = 'foo'
+        self.check_error_on_write(df, ValueError)
+
+        # column multi-index (a single entry, as df has a single column)
+        df.index = [0, 1, 2]
+        df.columns = pd.MultiIndex.from_tuples([('a', 1)])
+        self.check_error_on_write(df, ValueError)
+
+
+if __name__ == '__main__':  # run this module's tests directly via nose
+    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
+                   exit=False)
diff --git a/pandas/util/print_versions.py b/pandas/util/print_versions.py
index 657681d4c33ce..fac76be676398 100644
--- a/pandas/util/print_versions.py
+++ b/pandas/util/print_versions.py
@@ -80,6 +80,7 @@ def show_versions(as_json=False):
         ("bottleneck", lambda mod: mod.__version__),
         ("tables", lambda mod: mod.__version__),
         ("numexpr", lambda mod: mod.__version__),
+        ("feather", lambda mod: mod.version.version),
         ("matplotlib", lambda mod: mod.__version__),
         ("openpyxl", lambda mod: mod.__version__),
         ("xlrd", lambda mod: mod.__VERSION__),