ENH: Support fspath protocol

TomAugspurger · TomAugspurger · commit 14c9e5f1c498 · 2017-05-12T09:27:37.000-05:00
Ensures that most pandas readers and writers honor the fspath
protocol (PEP 519) when the object passed in defines ``__fspath__``.

TST: remove old xfails
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -20,6 +20,8 @@ Check the :ref:`API Changes <whatsnew_0210.api_breaking>` and :ref:`deprecations
 New features
 ~~~~~~~~~~~~
 
+- Support for `PEP 519 -- Adding a file system path protocol <https://www.python.org/dev/peps/pep-0519/>`_ on most readers and writers (:issue:`13823`)
+
 
 
 .. _whatsnew_0210.enhancements.other:
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -146,17 +146,29 @@ def _validate_header_arg(header):
 
 
 def _stringify_path(filepath_or_buffer):
-    """Return the argument coerced to a string if it was a pathlib.Path
-       or a py.path.local
+    """Attempt to convert a path-like object to a string.
 
     Parameters
     ----------
     filepath_or_buffer : object to be converted
 
     Returns
     -------
-    str_filepath_or_buffer : a the string version of the input path
+    str_filepath_or_buffer : maybe a string version of the object
+
+    Notes
+    -----
+    Objects supporting the fspath protocol (python 3.6+) are coerced
+    according to their __fspath__ method.
+
+    For backwards compatibility with older pythons, pathlib.Path and
+    py.path objects are specially coerced.
+
+    Any other object is passed through unchanged, which includes bytes,
+    strings, buffers, or anything else that's not even path-like.
     """
+    if hasattr(filepath_or_buffer, '__fspath__'):
+        return filepath_or_buffer.__fspath__()
     if _PATHLIB_INSTALLED and isinstance(filepath_or_buffer, pathlib.Path):
         return text_type(filepath_or_buffer)
     if _PY_PATH_INSTALLED and isinstance(filepath_or_buffer, LocalPath):
@@ -180,10 +192,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     -------
     a filepath_or_buffer, the encoding, the compression
     """
+    filepath_or_buffer = _stringify_path(filepath_or_buffer)
 
     if _is_url(filepath_or_buffer):
-        url = str(filepath_or_buffer)
-        req = _urlopen(url)
+        req = _urlopen(filepath_or_buffer)
         content_encoding = req.headers.get('Content-Encoding', None)
         if content_encoding == 'gzip':
             # Override compression based on Content-Encoding header
@@ -197,9 +209,6 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                                          encoding=encoding,
                                          compression=compression)
 
-    # Convert pathlib.Path/py.path.local or string
-    filepath_or_buffer = _stringify_path(filepath_or_buffer)
-
     if isinstance(filepath_or_buffer, (compat.string_types,
                                        compat.binary_type,
                                        mmap.mmap)):
diff --git a/pandas/io/excel.py b/pandas/io/excel.py
@@ -18,7 +18,8 @@
 from pandas.io.parsers import TextParser
 from pandas.errors import EmptyDataError
 from pandas.io.common import (_is_url, _urlopen, _validate_header_arg,
-                              get_filepath_or_buffer, _NA_VALUES)
+                              get_filepath_or_buffer, _NA_VALUES,
+                              _stringify_path)
 from pandas.core.indexes.period import Period
 import pandas._libs.json as json
 from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass,
@@ -233,7 +234,10 @@ def __init__(self, io, **kwds):
             raise ImportError("pandas requires xlrd >= 0.9.0 for excel "
                               "support, current version " + xlrd.__VERSION__)
 
+        # could be a str, ExcelFile, Book, etc.
         self.io = io
+        # A string for path-like input; any other object is left unchanged
+        self._io = _stringify_path(io)
 
         engine = kwds.pop('engine', None)
 
@@ -242,19 +246,19 @@ def __init__(self, io, **kwds):
 
         # If io is a url, want to keep the data as bytes so can't pass
         # to get_filepath_or_buffer()
-        if _is_url(io):
-            io = _urlopen(io)
-        elif not isinstance(io, (ExcelFile, xlrd.Book)):
-            io, _, _ = get_filepath_or_buffer(io)
-
-        if engine == 'xlrd' and isinstance(io, xlrd.Book):
-            self.book = io
-        elif not isinstance(io, xlrd.Book) and hasattr(io, "read"):
+        if _is_url(self._io):
+            self._io = _urlopen(self._io)
+        elif not isinstance(self.io, (ExcelFile, xlrd.Book)):
+            self._io, _, _ = get_filepath_or_buffer(self._io)
+
+        if engine == 'xlrd' and isinstance(self.io, xlrd.Book):
+            self.book = self.io
+        elif not isinstance(self._io, xlrd.Book) and hasattr(self._io, "read"):
             # N.B. xlrd.Book has a read attribute too
-            data = io.read()
+            data = self._io.read()
             self.book = xlrd.open_workbook(file_contents=data)
-        elif isinstance(io, compat.string_types):
-            self.book = xlrd.open_workbook(io)
+        elif isinstance(self._io, compat.string_types):
+            self.book = xlrd.open_workbook(self._io)
         else:
             raise ValueError('Must explicitly set engine if not passing in'
                              ' buffer or path for io.')
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
@@ -3,6 +3,7 @@
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index
 from pandas.compat import range
+from pandas.io.common import _stringify_path
 
 
 def _try_import():
@@ -43,6 +44,7 @@ def to_feather(df, path):
     path : string
         File path
     """
+    path = _stringify_path(path)
     if not isinstance(df, DataFrame):
         raise ValueError("feather only support IO with DataFrames")
 
@@ -99,4 +101,5 @@ def read_feather(path):
     """
 
     feather = _try_import()
+    path = _stringify_path(path)
     return feather.read_dataframe(path)
diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py
@@ -617,6 +617,9 @@ def write(self, writer, sheet_name='Sheet1', startrow=0,
             and ``io.excel.xlsm.writer``.
         """
         from pandas.io.excel import ExcelWriter
+        from pandas.io.common import _stringify_path
+
+        writer = _stringify_path(writer)
         need_save = False
         if isinstance(writer, string_types):
             writer = ExcelWriter(writer, engine=engine)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -369,7 +369,10 @@ def __init__(self, frame, buf=None, columns=None, col_space=None,
                  index_names=True, line_width=None, max_rows=None,
                  max_cols=None, show_dimensions=False, decimal='.', **kwds):
         self.frame = frame
-        self.buf = _expand_user(buf) if buf is not None else StringIO()
+        if buf is not None:
+            self.buf = _expand_user(_stringify_path(buf))
+        else:
+            self.buf = StringIO()
         self.show_index_names = index_names
 
         if sparsify is None:
diff --git a/pandas/io/json/json.py b/pandas/io/json/json.py
@@ -7,7 +7,8 @@
 from pandas.compat import StringIO, long, u
 from pandas import compat, isnull
 from pandas import Series, DataFrame, to_datetime, MultiIndex
-from pandas.io.common import get_filepath_or_buffer, _get_handle
+from pandas.io.common import (get_filepath_or_buffer, _get_handle,
+                              _stringify_path)
 from pandas.core.common import AbstractMethodError
 from pandas.io.formats.printing import pprint_thing
 from .normalize import _convert_to_line_delimits
@@ -25,6 +26,7 @@ def to_json(path_or_buf, obj, orient=None, date_format='epoch',
             double_precision=10, force_ascii=True, date_unit='ms',
             default_handler=None, lines=False):
 
+    path_or_buf = _stringify_path(path_or_buf)
     if lines and orient != 'records':
         raise ValueError(
             "'lines' keyword only valid when 'orient' is records")
diff --git a/pandas/io/packers.py b/pandas/io/packers.py
@@ -61,7 +61,7 @@
 from pandas.core.sparse.array import BlockIndex, IntIndex
 from pandas.core.generic import NDFrame
 from pandas.errors import PerformanceWarning
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, _stringify_path
 from pandas.core.internals import BlockManager, make_block, _safe_reshape
 import pandas.core.internals as internals
 
@@ -149,6 +149,7 @@ def writer(fh):
         for a in args:
             fh.write(pack(a, **kwargs))
 
+    path_or_buf = _stringify_path(path_or_buf)
     if isinstance(path_or_buf, compat.string_types):
         with open(path_or_buf, mode) as fh:
             writer(fh)
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
@@ -4,7 +4,7 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.core.dtypes.common import is_datetime64_dtype, _NS_DTYPE
-from pandas.io.common import _get_handle, _infer_compression
+from pandas.io.common import _get_handle, _infer_compression, _stringify_path
 
 
 def to_pickle(obj, path, compression='infer'):
@@ -21,6 +21,7 @@ def to_pickle(obj, path, compression='infer'):
 
         .. versionadded:: 0.20.0
     """
+    path = _stringify_path(path)
     inferred_compression = _infer_compression(path, compression)
     f, fh = _get_handle(path, 'wb',
                         compression=inferred_compression,
@@ -56,7 +57,7 @@ def read_pickle(path, compression='infer'):
     -------
     unpickled : type of object stored in file
     """
-
+    path = _stringify_path(path)
     inferred_compression = _infer_compression(path, compression)
 
     def read_wrapper(func):
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
@@ -2,6 +2,7 @@
 Read SAS sas7bdat or xport files.
 """
 from pandas import compat
+from pandas.io.common import _stringify_path
 
 
 def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
@@ -34,6 +35,7 @@ def read_sas(filepath_or_buffer, format=None, index=None, encoding=None,
         buffer_error_msg = ("If this is a buffer object rather "
                             "than a string name, you must specify "
                             "a format string")
+        filepath_or_buffer = _stringify_path(filepath_or_buffer)
         if not isinstance(filepath_or_buffer, compat.string_types):
             raise ValueError(buffer_error_msg)
         try:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -30,7 +30,8 @@
 from pandas.util._decorators import Appender
 import pandas as pd
 
-from pandas.io.common import get_filepath_or_buffer, BaseIterator
+from pandas.io.common import (get_filepath_or_buffer, BaseIterator,
+                              _stringify_path)
 from pandas._libs.lib import max_len_string_array, infer_dtype
 from pandas._libs.tslib import NaT, Timestamp
 
@@ -976,6 +977,7 @@ def __init__(self, path_or_buf, convert_dates=True,
         self._lines_read = 0
 
         self._native_byteorder = _set_endianness(sys.byteorder)
+        path_or_buf = _stringify_path(path_or_buf)
         if isinstance(path_or_buf, str):
             path_or_buf, encoding, _ = get_filepath_or_buffer(
                 path_or_buf, encoding=self._default_encoding
@@ -1930,7 +1932,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
         if byteorder is None:
             byteorder = sys.byteorder
         self._byteorder = _set_endianness(byteorder)
-        self._fname = fname
+        self._fname = _stringify_path(fname)
         self.type_converters = {253: np.int32, 252: np.int16, 251: np.int8}
 
     def _write(self, to_write):
diff --git a/pandas/tests/io/data/feather-0_3_1.feather b/pandas/tests/io/data/feather-0_3_1.feather
diff --git a/pandas/tests/io/data/fixed_width_format.txt b/pandas/tests/io/data/fixed_width_format.txt
@@ -0,0 +1,3 @@
+A   B   C
+1   2   3
+4   5   6
diff --git a/pandas/tests/io/msgpack/data/frame.mp b/pandas/tests/io/msgpack/data/frame.mp
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
@@ -66,7 +66,6 @@ def test_from_iterator(self):
                 tm.assert_frame_equal(df, df0.iloc[2:5, :])
                 rdr.close()
 
-    @pytest.mark.xfail(reason="read_sas currently doesn't work with pathlib")
     def test_path_pathlib(self):
         tm._skip_if_no_pathlib()
         from pathlib import Path
@@ -77,7 +76,6 @@ def test_path_pathlib(self):
                 df = pd.read_sas(fname, encoding='utf-8')
                 tm.assert_frame_equal(df, df0)
 
-    @pytest.mark.xfail(reason="read_sas currently doesn't work with localpath")
     def test_path_localpath(self):
         tm._skip_if_no_localpath()
         from py.path import local as LocalPath
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
@@ -6,6 +6,7 @@
 import os
 from os.path import isabs
 
+import pandas as pd
 import pandas.util.testing as tm
 
 from pandas.io import common
@@ -24,6 +25,18 @@
     pass
 
 
+class CustomFSPath(object):
+    """For testing fspath on unknown objects"""
+    def __init__(self, path):
+        self.path = path
+
+    def __fspath__(self):
+        return self.path
+
+
+HERE = os.path.dirname(__file__)
+
+
 class TestCommonIOCapabilities(object):
     data1 = """index,A,B,C,D
 foo,2,3,4,5
@@ -65,6 +78,11 @@ def test_stringify_path_localpath(self):
         lpath = LocalPath(path)
         assert common._stringify_path(lpath) == abs_path
 
+    def test_stringify_path_fspath(self):
+        p = CustomFSPath('foo/bar.csv')
+        result = common._stringify_path(p)
+        assert result == 'foo/bar.csv'
+
     def test_get_filepath_or_buffer_with_path(self):
         filename = '~/sometest'
         filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)
@@ -89,6 +107,69 @@ def test_iterator(self):
         tm.assert_frame_equal(first, expected.iloc[[0]])
         tm.assert_frame_equal(concat(it), expected.iloc[1:])
 
+    @pytest.mark.parametrize('reader, module, path', [
+        (pd.read_csv, 'os', os.path.join(HERE, 'data', 'iris.csv')),
+        (pd.read_table, 'os', os.path.join(HERE, 'data', 'iris.csv')),
+        (pd.read_fwf, 'os', os.path.join(HERE, 'data',
+                                         'fixed_width_format.txt')),
+        (pd.read_excel, 'xlrd', os.path.join(HERE, 'data', 'test1.xlsx')),
+        (pd.read_feather, 'feather', os.path.join(HERE, 'data',
+                                                  'feather-0_3_1.feather')),
+        (pd.read_hdf, 'tables', os.path.join(HERE, 'data', 'legacy_hdf',
+                                             'datetimetz_object.h5')),
+        (pd.read_stata, 'os', os.path.join(HERE, 'data', 'stata10_115.dta')),
+        (pd.read_sas, 'os', os.path.join(HERE, 'sas', 'data',
+                                         'test1.sas7bdat')),
+        (pd.read_json, 'os', os.path.join(HERE, 'json', 'data',
+                                          'tsframe_v012.json')),
+        (pd.read_msgpack, 'os', os.path.join(HERE, 'msgpack', 'data',
+                                             'frame.mp')),
+        (pd.read_pickle, 'os', os.path.join(HERE, 'data',
+                                            'categorical_0_14_1.pickle')),
+    ])
+    def test_read_fspath_all(self, reader, module, path):
+        # An __fspath__ object must read the same data as the plain path.
+        pytest.importorskip(module)
+        result = reader(CustomFSPath(path))
+        expected = reader(path)
+        if path.endswith('.pickle'):
+            # this pickle fixture holds a Categorical, not a DataFrame
+            tm.assert_categorical_equal(result, expected)
+        else:
+            tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize('writer_name, writer_kwargs, module', [
+        ('to_csv', {}, 'os'),
+        ('to_excel', {'engine': 'xlwt'}, 'xlwt'),
+        ('to_feather', {}, 'feather'),
+        ('to_hdf', {'key': 'bar', 'mode': 'w'}, 'tables'),
+        ('to_html', {}, 'os'),
+        ('to_json', {}, 'os'),
+        ('to_latex', {}, 'os'),
+        ('to_msgpack', {}, 'os'),
+        ('to_pickle', {}, 'os'),
+        ('to_stata', {}, 'os'),
+    ])
+    def test_write_fspath_all(self, writer_name, writer_kwargs, module):
+        p1 = tm.ensure_clean('string')
+        p2 = tm.ensure_clean('fspath')
+        df = pd.DataFrame({"A": [1, 2]})
+
+        with p1 as string, p2 as fspath:
+            pytest.importorskip(module)
+            mypath = CustomFSPath(fspath)
+            writer = getattr(df, writer_name)
+
+            writer(string, **writer_kwargs)
+            with open(string, 'rb') as f:
+                expected = f.read()
+
+            writer(mypath, **writer_kwargs)
+            with open(fspath, 'rb') as f:
+                result = f.read()
+
+            assert result == expected
+
 
 class TestMMapWrapper(object):
 
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
diff --git a/pandas/tests/io/test_packers.py b/pandas/tests/io/test_packers.py
diff --git a/pandas/tests/io/test_pytables.py b/pandas/tests/io/test_pytables.py
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py