Skip to content

Commit 5278fb5

Browse files
committed
DOC, ENH: Support memory_map for Python engine
[ci skip]
1 parent 158ae5b commit 5278fb5

File tree

9 files changed

+127
-7
lines changed

9 files changed

+127
-7
lines changed

doc/source/io.rst

+4
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,10 @@ use_unsigned : boolean, default False
198198

199199
If integer columns are being compacted (i.e. ``compact_ints=True``), specify whether
200200
the column should be compacted to the smallest signed or unsigned integer dtype.
201+
memory_map : boolean, default False
202+
If a filepath is provided for ``filepath_or_buffer``, map the file object
203+
directly onto memory and access the data directly from there. Using this
204+
option can improve performance by avoiding the overhead of repeated read calls.
201205

202206
NA and Missing Data Handling
203207
++++++++++++++++++++++++++++

doc/source/whatsnew/v0.18.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ Other enhancements
7676

7777
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`)
7878
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``na_filter`` option (:issue:`13321`)
79+
- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``memory_map`` option (:issue:`13381`)
7980

8081
- ``Index.astype()`` now accepts an optional boolean argument ``copy``, which allows optional copying if the requirements on dtype are satisfied (:issue:`13209`)
8182
- ``Index`` now supports the ``.where()`` function for same shape indexing (:issue:`13170`)

pandas/io/common.py

+48-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import os
55
import csv
66
import codecs
7+
import mmap
78
import zipfile
89
from contextlib import contextmanager, closing
910

@@ -276,7 +277,7 @@ def ZipFile(*args, **kwargs):
276277
ZipFile = zipfile.ZipFile
277278

278279

279-
def _get_handle(path, mode, encoding=None, compression=None):
280+
def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
280281
"""Gets file handle for given path and mode.
281282
"""
282283
if compression is not None:
@@ -324,9 +325,55 @@ def _get_handle(path, mode, encoding=None, compression=None):
324325
else:
325326
f = open(path, mode)
326327

328+
if memory_map and hasattr(f, 'fileno'):
329+
try:
330+
f = MMapWrapper(f)
331+
except Exception:
332+
# we catch any errors that may have occurred
333+
# because that is consistent with the lower-level
334+
# functionality of the C engine (pd.read_csv), so
335+
# leave the file handler as is then
336+
pass
337+
327338
return f
328339

329340

341+
class MMapWrapper(BaseIterator):
    """
    Wrapper for Python's mmap class so that it can be properly read in
    by Python's csv.reader class.

    Attribute lookups that are not resolved on the wrapper itself are
    forwarded to the underlying mmap object, and each call to ``next()``
    on the wrapper returns the next line of the mapped file as a str.

    Parameters
    ----------
    f : file object
        File object to be mapped onto memory. Must support the 'fileno'
        method or have an equivalent attribute

    """

    def __init__(self, f):
        # a length of 0 asks mmap to map the whole file; ACCESS_READ
        # makes the mapping read-only
        self.mmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

    def __getattr__(self, name):
        # __getattr__ only runs when normal lookup fails, so being asked
        # for 'mmap' means the mapping was never created (for example the
        # instance was built without calling __init__). Delegating in that
        # case would re-enter this method without end, so fail cleanly.
        if name == 'mmap':
            raise AttributeError(name)
        return getattr(self.mmap, name)

    def __next__(self):
        newline = self.mmap.readline()

        # readline returns bytes, not str, in Python 3,
        # but Python's CSV reader expects str, so convert
        # the output to str before continuing
        if compat.PY3:
            newline = compat.bytes_to_str(newline)

        # mmap doesn't raise if reading past the allocated
        # data but instead returns an empty string, so raise
        # if that is returned
        if newline == '':
            raise StopIteration
        return newline
376+
330377
class UTF8Recoder(BaseIterator):
331378

332379
"""

pandas/io/parsers.py

+7-2
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,10 @@
261261
If integer columns are being compacted (i.e. `compact_ints=True`), specify
262262
whether the column should be compacted to the smallest signed or unsigned
263263
integer dtype.
264+
memory_map : boolean, default False
265+
If a filepath is provided for `filepath_or_buffer`, map the file object
266+
directly onto memory and access the data directly from there. Using this
267+
option can improve performance by avoiding the overhead of repeated read calls.
264268
265269
Returns
266270
-------
@@ -459,7 +463,6 @@ def _read(filepath_or_buffer, kwds):
459463
_c_unsupported = set(['skip_footer'])
460464
_python_unsupported = set([
461465
'low_memory',
462-
'memory_map',
463466
'buffer_lines',
464467
'error_bad_lines',
465468
'warn_bad_lines',
@@ -1683,6 +1686,7 @@ def __init__(self, f, **kwds):
16831686

16841687
self.encoding = kwds['encoding']
16851688
self.compression = kwds['compression']
1689+
self.memory_map = kwds['memory_map']
16861690
self.skiprows = kwds['skiprows']
16871691

16881692
self.skip_footer = kwds['skip_footer']
@@ -1718,7 +1722,8 @@ def __init__(self, f, **kwds):
17181722

17191723
if isinstance(f, compat.string_types):
17201724
f = _get_handle(f, 'r', encoding=self.encoding,
1721-
compression=self.compression)
1725+
compression=self.compression,
1726+
memory_map=self.memory_map)
17221727
elif self.compression:
17231728
f = _wrap_compressed(f, self.compression, self.encoding)
17241729
# in Python 3, convert BytesIO or fileobjects passed with an encoding

pandas/io/tests/data/test_mmap.csv

+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
a,b,c
2+
1,one,I
3+
2,two,II
4+
5+
3,three,III

pandas/io/tests/parser/c_parser_only.py

-4
Original file line numberDiff line numberDiff line change
@@ -285,10 +285,6 @@ def test_usecols_dtypes(self):
285285
self.assertTrue((result.dtypes == [object, np.int, np.float]).all())
286286
self.assertTrue((result2.dtypes == [object, np.float]).all())
287287

288-
def test_memory_map(self):
289-
# it works!
290-
self.read_csv(self.csv1, memory_map=True)
291-
292288
def test_disable_bool_parsing(self):
293289
# #2090
294290

pandas/io/tests/parser/common.py

+11
Original file line numberDiff line numberDiff line change
@@ -1458,3 +1458,14 @@ def test_as_recarray(self):
14581458
out = self.read_csv(StringIO(data), as_recarray=True,
14591459
usecols=['a'])
14601460
tm.assert_numpy_array_equal(out, expected)
1461+
1462+
def test_memory_map(self):
    # reading the fixture with memory_map=True must produce the
    # three-row frame spelled out below
    csv_path = os.path.join(self.dirpath, 'test_mmap.csv')
    result = self.read_csv(csv_path, memory_map=True)

    expected = DataFrame({
        'a': [1, 2, 3],
        'b': ['one', 'two', 'three'],
        'c': ['I', 'II', 'III']
    })
    tm.assert_frame_equal(result, expected)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
a,b,c
2+
1,one,I
3+
2,two,II
4+
3,three,III

pandas/io/tests/test_common.py

+47
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
Tests for the pandas.io.common functionalities
33
"""
44
from pandas.compat import StringIO
5+
import mmap
56
import os
67
from os.path import isabs
78

@@ -87,3 +88,49 @@ def test_iterator(self):
8788
tm.assert_frame_equal(first, expected.iloc[[0]])
8889
expected.index = [0 for i in range(len(expected))]
8990
tm.assert_frame_equal(concat(it), expected.iloc[1:])
91+
92+
93+
class TestMMapWrapper(tm.TestCase):
    """Tests for common.MMapWrapper, run against the test_mmap.csv fixture."""

    def setUp(self):
        self.mmap_file = os.path.join(tm.get_data_path(),
                                      'test_mmap.csv')

    def test_constructor_bad_file(self):
        # an object whose fileno() is not a real descriptor cannot be mapped
        non_file = StringIO('I am not a file')
        non_file.fileno = lambda: -1

        msg = "Invalid argument"
        tm.assertRaisesRegexp(mmap.error, msg, common.MMapWrapper, non_file)

        # a file that has already been closed must be rejected as well
        target = open(self.mmap_file, 'r')
        target.close()

        msg = "I/O operation on closed file"
        tm.assertRaisesRegexp(ValueError, msg, common.MMapWrapper, target)

    def test_get_attr(self):
        # the with-block guarantees the fixture handle is closed even if
        # an assertion fails
        with open(self.mmap_file, 'r') as target:
            wrapper = common.MMapWrapper(target)

            # every public attribute of the underlying mmap object, plus
            # the wrapper's own __next__, must be reachable via the wrapper
            attrs = dir(wrapper.mmap)
            attrs = [attr for attr in attrs
                     if not attr.startswith('__')]
            attrs.append('__next__')

            for attr in attrs:
                self.assertTrue(hasattr(wrapper, attr))

            self.assertFalse(hasattr(wrapper, 'foo'))

    def test_next(self):
        # the with-block guarantees the fixture handle is closed even if
        # an assertion fails
        with open(self.mmap_file, 'r') as target:
            wrapper = common.MMapWrapper(target)

            lines = target.readlines()

            # the wrapper must hand back the same lines, in the same
            # order, as reading the file directly
            for line in lines:
                next_line = next(wrapper)
                self.assertEqual(next_line, line)

            self.assertRaises(StopIteration, next, wrapper)

0 commit comments

Comments
 (0)