DOC, ENH: Support memory_map for Python engine

gfyoung · gfyoung · commit 5278fb5c507c · 2016-06-08T11:04:44.000+01:00
[ci skip]
diff --git a/doc/source/io.rst b/doc/source/io.rst
@@ -198,6 +198,10 @@ use_unsigned : boolean, default False
 
   If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
   the column should be compacted to the smallest signed or unsigned integer dtype.
+memory_map : boolean, default False
+  If a filepath is provided for ``filepath_or_buffer``, map the file object
+  directly into memory and read the data from there. Using this option can
+  improve performance by avoiding the overhead of repeated read calls.
 
 NA and Missing Data Handling
 ++++++++++++++++++++++++++++
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -76,6 +76,7 @@ Other enhancements
 
 - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
 - The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
+- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)
 
 - ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
 - ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -4,6 +4,7 @@
 import os
 import csv
 import codecs
+import mmap
 import zipfile
 from contextlib import contextmanager, closing
 
@@ -276,7 +277,7 @@ def ZipFile(*args, **kwargs):
     ZipFile = zipfile.ZipFile
 
 
-def _get_handle(path, mode, encoding=None, compression=None):
+def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
     """Gets file handle for given path and mode.
     """
     if compression is not None:
@@ -324,9 +325,55 @@ def _get_handle(path, mode, encoding=None, compression=None):
         else:
             f = open(path, mode)
 
+    if memory_map and hasattr(f, 'fileno'):
+        try:
+            f = MMapWrapper(f)
+        except Exception:
+            # if the file cannot be memory-mapped, swallow the
+            # error and fall back to the plain file handle; this
+            # mirrors the lower-level behavior of the C engine
+            # (pd.read_csv)
+            pass
+
     return f
 
 
+class MMapWrapper(BaseIterator):
+    """
+    Wrapper for Python's mmap class so that it can be properly read in
+    by Python's csv.reader.
+
+    Parameters
+    ----------
+    f : file object
+        File object to be mapped onto memory. Must support the 'fileno'
+        method or have an equivalent attribute
+
+    """
+
+    def __init__(self, f):
+        self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
+
+    def __getattr__(self, name):
+        return getattr(self.mmap, name)
+
+    def __next__(self):
+        newline = self.mmap.readline()
+
+        # readline returns bytes, not str, in Python 3,
+        # but Python's CSV reader expects str, so convert
+        # the output to str before continuing
+        if compat.PY3:
+            newline = compat.bytes_to_str(newline)
+
+        # mmap.readline doesn't raise when reading past the end
+        # of the mapped data but returns an empty string instead,
+        # so signal the end of iteration ourselves in that case
+        if newline == '':
+            raise StopIteration
+        return newline
+
+
 class UTF8Recoder(BaseIterator):
 
     """
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -261,6 +261,10 @@
     If integer columns are being compacted (i.e. `compact_ints=True`), specify
     whether the column should be compacted to the smallest signed or unsigned
     integer dtype.
+memory_map : boolean, default False
+    If a filepath is provided for `filepath_or_buffer`, map the file object
+    directly into memory and read the data from there. Using this option can
+    improve performance by avoiding the overhead of repeated read calls.
 
 Returns
 -------
@@ -459,7 +463,6 @@ def _read(filepath_or_buffer, kwds):
 _c_unsupported = set(['skip_footer'])
 _python_unsupported = set([
     'low_memory',
-    'memory_map',
     'buffer_lines',
     'error_bad_lines',
     'warn_bad_lines',
@@ -1683,6 +1686,7 @@ def __init__(self, f, **kwds):
 
         self.encoding = kwds['encoding']
         self.compression = kwds['compression']
+        self.memory_map = kwds['memory_map']
         self.skiprows = kwds['skiprows']
 
         self.skip_footer = kwds['skip_footer']
@@ -1718,7 +1722,8 @@ def __init__(self, f, **kwds):
 
         if isinstance(f, compat.string_types):
             f = _get_handle(f, 'r', encoding=self.encoding,
-                            compression=self.compression)
+                            compression=self.compression,
+                            memory_map=self.memory_map)
         elif self.compression:
             f = _wrap_compressed(f, self.compression, self.encoding)
         # in Python 3, convert BytesIO or fileobjects passed with an encoding
diff --git a/pandas/io/tests/data/test_mmap.csv b/pandas/io/tests/data/test_mmap.csv
@@ -0,0 +1,5 @@
+a,b,c
+1,one,I
+2,two,II
+
+3,three,III
diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py
@@ -285,10 +285,6 @@ def test_usecols_dtypes(self):
         self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
         self.assertTrue((result2.dtypes == [object, np.float]).all())
 
-    def test_memory_map(self):
-        # it works!
-        self.read_csv(self.csv1, memory_map=True)
-
     def test_disable_bool_parsing(self):
         # #2090
 
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -1458,3 +1458,14 @@ def test_as_recarray(self):
             out = self.read_csv(StringIO(data), as_recarray=True,
                                 usecols=['a'])
             tm.assert_numpy_array_equal(out, expected)
+
+    def test_memory_map(self):
+        mmap_file = os.path.join(self.dirpath, 'test_mmap.csv')
+        expected = DataFrame({
+            'a': [1, 2, 3],
+            'b': ['one', 'two', 'three'],
+            'c': ['I', 'II', 'III']
+        })
+
+        out = self.read_csv(mmap_file, memory_map=True)
+        tm.assert_frame_equal(out, expected)
diff --git a/pandas/io/tests/parser/data/test_mmap.csv b/pandas/io/tests/parser/data/test_mmap.csv
@@ -0,0 +1,4 @@
+a,b,c
+1,one,I
+2,two,II
+3,three,III
diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py
@@ -2,6 +2,7 @@
     Tests for the pandas.io.common functionalities
 """
 from pandas.compat import StringIO
+import mmap
 import os
 from os.path import isabs
 
@@ -87,3 +88,49 @@ def test_iterator(self):
         tm.assert_frame_equal(first, expected.iloc[[0]])
         expected.index = [0 for i in range(len(expected))]
         tm.assert_frame_equal(concat(it), expected.iloc[1:])
+
+
+class TestMMapWrapper(tm.TestCase):
+
+    def setUp(self):
+        self.mmap_file = os.path.join(tm.get_data_path(),
+                                      'test_mmap.csv')
+
+    def test_constructor_bad_file(self):
+        non_file = StringIO('I am not a file')
+        non_file.fileno = lambda: -1
+
+        msg = "Invalid argument"
+        tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)
+
+        target = open(self.mmap_file, 'r')
+        target.close()
+
+        msg = "I/O operation on closed file"
+        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)
+
+    def test_get_attr(self):
+        target = open(self.mmap_file, 'r')
+        wrapper = common.MMapWrapper(target)
+
+        attrs = dir(wrapper.mmap)
+        attrs = [attr for attr in attrs
+                 if not attr.startswith('__')]
+        attrs.append('__next__')
+
+        for attr in attrs:
+            self.assertTrue(hasattr(wrapper, attr))
+
+        self.assertFalse(hasattr(wrapper, 'foo'))
+
+    def test_next(self):
+        target = open(self.mmap_file, 'r')
+        wrapper = common.MMapWrapper(target)
+
+        lines = target.readlines()
+
+        for line in lines:
+            next_line = next(wrapper)
+            self.assertEqual(next_line, line)
+
+        self.assertRaises(StopIteration, next, wrapper)