diff --git a/pandas/formats/format.py b/pandas/formats/format.py index 7706666142a64..0cf6050e515e0 100644 --- a/pandas/formats/format.py +++ b/pandas/formats/format.py @@ -1455,9 +1455,9 @@ def save(self): f = self.path_or_buf close = False else: - f = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, - compression=self.compression) + f, handles = _get_handle(self.path_or_buf, self.mode, + encoding=self.encoding, + compression=self.compression) close = True try: diff --git a/pandas/io/common.py b/pandas/io/common.py index 7076d5a62b626..b5a3aec490608 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -1,11 +1,9 @@ """Common IO api utilities""" -import sys import os import csv import codecs import mmap -import zipfile from contextlib import contextmanager, closing from pandas.compat import StringIO, BytesIO, string_types, text_type @@ -141,39 +139,6 @@ def _is_s3_url(url): return False -def maybe_read_encoded_stream(reader, encoding=None, compression=None): - """read an encoded stream from the reader and transform the bytes to - unicode if required based on the encoding - - Parameters - ---------- - reader : a streamable file-like object - encoding : optional, the encoding to attempt to read - - Returns - ------- - a tuple of (a stream of decoded bytes, the encoding which was used) - - """ - - if compat.PY3 or encoding is not None: # pragma: no cover - if encoding: - errors = 'strict' - else: - errors = 'replace' - encoding = 'utf-8' - - if compression == 'gzip': - reader = BytesIO(reader.read()) - else: - reader = StringIO(reader.read().decode(encoding, errors)) - else: - if compression == 'gzip': - reader = BytesIO(reader.read()) - encoding = None - return reader, encoding - - def _expand_user(filepath_or_buffer): """Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -237,18 +202,14 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, """ if _is_url(filepath_or_buffer): - req = _urlopen(str(filepath_or_buffer)) - if compression == 'infer': - content_encoding = req.headers.get('Content-Encoding', None) - if content_encoding == 'gzip': - compression = 'gzip' - else: - compression = None - # cat on the compression to the tuple returned by the function - to_return = (list(maybe_read_encoded_stream(req, encoding, - compression)) + - [compression]) - return tuple(to_return) + url = str(filepath_or_buffer) + req = _urlopen(url) + content_encoding = req.headers.get('Content-Encoding', None) + if content_encoding == 'gzip': + # Override compression based on Content-Encoding header + compression = 'gzip' + reader = BytesIO(req.read()) + return reader, encoding, compression if _is_s3_url(filepath_or_buffer): from pandas.io.s3 import get_filepath_or_buffer @@ -276,64 +237,145 @@ def file_path_to_url(path): return urljoin('file:', pathname2url(path)) -# ZipFile is not a context manager for <= 2.6 -# must be tuple index here since 2.6 doesn't use namedtuple for version_info -if sys.version_info[1] <= 6: - @contextmanager - def ZipFile(*args, **kwargs): - with closing(zipfile.ZipFile(*args, **kwargs)) as zf: - yield zf -else: - ZipFile = zipfile.ZipFile +_compression_to_extension = { + 'gzip': '.gz', + 'bz2': '.bz2', + 'zip': '.zip', + 'xz': '.xz', +} + + +def _infer_compression(filepath_or_buffer, compression): + """ + If compression='infer', infer compression. 
+    If compression is given explicitly, validate it and return it; return
+    None when no compression is specified or none can be inferred.
+    """
+
+    # No compression has been explicitly specified
+    if compression is None:
+        return None
+
+    # Cannot infer compression of a buffer. Hence assume no compression.
+    is_path = isinstance(filepath_or_buffer, compat.string_types)
+    if compression == 'infer' and not is_path:
+        return None
+
+    # Infer compression from the filename/URL extension
+    if compression == 'infer':
+        for compression, extension in _compression_to_extension.items():
+            if filepath_or_buffer.endswith(extension):
+                return compression
+        return None
+
+    # Compression has been specified. Check that it's valid
+    if compression in _compression_to_extension:
+        return compression
+
+    msg = 'Unrecognized compression type: {}'.format(compression)
+    valid = ['infer', None] + sorted(_compression_to_extension)
+    msg += '\nValid compression types are {}'.format(valid)
+    raise ValueError(msg)
+
+
-def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
-    """Gets file handle for given path and mode.
+def _get_handle(path_or_buf, mode, encoding=None, compression=None,
+                memory_map=False):
     """
-    if compression is not None:
-        if encoding is not None and not compat.PY3:
-            msg = 'encoding + compression not yet supported in Python 2'
+    Get file handle for given path/buffer and mode.
+
+    Parameters
+    ----------
+    path_or_buf : str or file-like
+        a path (str) or buffer to open
+    mode : str
+        mode to open path_or_buf with
+    encoding : str or None
+    compression : str or None
+        Supported compression protocols are gzip, bz2, zip, and xz
+    memory_map : boolean, default False
+        See parsers._parser_params for more information.
+
+    Returns
+    -------
+    f : file-like
+        A file-like object
+    handles : list of file-like objects
+        A list of file-like objects that were opened in this function.
+    """
+
+    handles = list()
+    f = path_or_buf
+    is_path = isinstance(path_or_buf, compat.string_types)
+
+    if compression:
+
+        if compat.PY2 and not is_path and encoding:
+            msg = 'compression with encoding is not yet supported in Python 2'
             raise ValueError(msg)
 
+        # GZ Compression
         if compression == 'gzip':
             import gzip
-            f = gzip.GzipFile(path, mode)
+            if is_path:
+                f = gzip.open(path_or_buf, mode)
+            else:
+                f = gzip.GzipFile(fileobj=path_or_buf)
+
+        # BZ Compression
         elif compression == 'bz2':
             import bz2
-            f = bz2.BZ2File(path, mode)
+            if is_path:
+                f = bz2.BZ2File(path_or_buf, mode)
+            elif compat.PY2:
+                # Python 2's bz2 module can't take file objects, so have to
+                # run through decompress manually
+                f = StringIO(bz2.decompress(path_or_buf.read()))
+                path_or_buf.close()
+            else:
+                f = bz2.BZ2File(path_or_buf)
+
+        # ZIP Compression
         elif compression == 'zip':
             import zipfile
-            zip_file = zipfile.ZipFile(path)
+            zip_file = zipfile.ZipFile(path_or_buf)
             zip_names = zip_file.namelist()
-            if len(zip_names) == 1:
-                file_name = zip_names.pop()
-                f = zip_file.open(file_name)
+            if len(zip_names) == 1:
+                f = zip_file.open(zip_names.pop())
             elif len(zip_names) == 0:
                 raise ValueError('Zero files found in ZIP file {}'
-                                 .format(path))
+                                 .format(path_or_buf))
             else:
                 raise ValueError('Multiple files found in ZIP file.'
-                                 ' Only one file per ZIP :{}'
+                                 ' Only one file per ZIP: {}'
                                  .format(zip_names))
+
+        # XZ Compression
         elif compression == 'xz':
             lzma = compat.import_lzma()
-            f = lzma.LZMAFile(path, mode)
+            f = lzma.LZMAFile(path_or_buf, mode)
+
+        # Unrecognized Compression
         else:
-            raise ValueError('Unrecognized compression type: %s' %
-                             compression)
-        if compat.PY3:
-            from io import TextIOWrapper
-            f = TextIOWrapper(f, encoding=encoding)
-        return f
-    else:
-        if compat.PY3:
-            if encoding:
-                f = open(path, mode, encoding=encoding)
-            else:
-                f = open(path, mode, errors='replace')
+            msg = 'Unrecognized compression type: {}'.format(compression)
+            raise ValueError(msg)
+
+        handles.append(f)
+
+    elif is_path:
+        if compat.PY2:
+            # Python 2
+            f = open(path_or_buf, mode)
+        elif encoding:
+            # Python 3 and encoding
+            f = open(path_or_buf, mode, encoding=encoding)
         else:
-            f = open(path, mode)
+            # Python 3 and no explicit encoding
+            f = open(path_or_buf, mode, errors='replace')
+        handles.append(f)
+
+    # in Python 3, convert BytesIO or file objects passed with an encoding
+    if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
+        from io import TextIOWrapper
+        f = TextIOWrapper(f, encoding=encoding)
+        handles.append(f)
 
     if memory_map and hasattr(f, 'fileno'):
         try:
@@ -347,7 +389,7 @@ def _get_handle(path, mode, encoding=None, compression=None, memory_map=False):
             # leave the file handler as is then
             pass
 
-    return f
+    return f, handles
 
 
 class MMapWrapper(BaseIterator):
diff --git a/pandas/io/json.py b/pandas/io/json.py
index 878506a6ddc05..5b1a40736ace3 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json.py
@@ -259,8 +259,10 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
             exists = False
 
         if exists:
-            with _get_handle(filepath_or_buffer, 'r', encoding=encoding) as fh:
-                json = fh.read()
+            fh, handles = _get_handle(filepath_or_buffer, 'r',
+                                      encoding=encoding)
+            json = fh.read()
+            fh.close()
         else:
             json = filepath_or_buffer
     elif hasattr(filepath_or_buffer, 'read'):
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 30443f894a64d..3cd23150bb0bf 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -27,12 +27,11 @@ from pandas.core.frame import DataFrame
 from pandas.core.categorical import Categorical
 from pandas.core.common import AbstractMethodError
-from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
 from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
                               _get_handle, UnicodeReader, UTF8Recoder,
                               BaseIterator, ParserError, EmptyDataError,
-                              ParserWarning, _NA_VALUES)
+                              ParserWarning, _NA_VALUES, _infer_compression)
 from pandas.tseries import tools
 
 from pandas.util.decorators import Appender
@@ -354,37 +353,17 @@ def _validate_nrows(nrows):
 
 
 def _read(filepath_or_buffer, kwds):
-    "Generic reader of line files."
+    """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
     if encoding is not None:
         encoding = re.sub('_', '-', encoding).lower()
         kwds['encoding'] = encoding
 
-    # If the input could be a filename, check for a recognizable compression
-    # extension. If we're reading from a URL, the `get_filepath_or_buffer`
-    # will use header info to determine compression, so use what it finds in
-    # that case.
- inferred_compression = kwds.get('compression') - if inferred_compression == 'infer': - if isinstance(filepath_or_buffer, compat.string_types): - if filepath_or_buffer.endswith('.gz'): - inferred_compression = 'gzip' - elif filepath_or_buffer.endswith('.bz2'): - inferred_compression = 'bz2' - elif filepath_or_buffer.endswith('.zip'): - inferred_compression = 'zip' - elif filepath_or_buffer.endswith('.xz'): - inferred_compression = 'xz' - else: - inferred_compression = None - else: - inferred_compression = None - + compression = kwds.get('compression') + compression = _infer_compression(filepath_or_buffer, compression) filepath_or_buffer, _, compression = get_filepath_or_buffer( - filepath_or_buffer, encoding, - compression=kwds.get('compression', None)) - kwds['compression'] = (inferred_compression if compression == 'infer' - else compression) + filepath_or_buffer, encoding, compression) + kwds['compression'] = compression if kwds.get('date_parser', None) is not None: if isinstance(kwds['parse_dates'], bool): @@ -1771,70 +1750,6 @@ def count_empty_vals(vals): return sum([1 for v in vals if v == '' or v is None]) -def _wrap_compressed(f, compression, encoding=None): - """wraps compressed fileobject in a decompressing fileobject - NOTE: For all files in Python 3.2 and for bzip'd files under all Python - versions, this means reading in the entire file and then re-wrapping it in - StringIO. - """ - compression = compression.lower() - encoding = encoding or get_option('display.encoding') - - if compression == 'gzip': - import gzip - - f = gzip.GzipFile(fileobj=f) - if compat.PY3: - from io import TextIOWrapper - - f = TextIOWrapper(f) - return f - elif compression == 'bz2': - import bz2 - - if compat.PY3: - f = bz2.open(f, 'rt', encoding=encoding) - else: - # Python 2's bz2 module can't take file objects, so have to - # run through decompress manually - data = bz2.decompress(f.read()) - f = StringIO(data) - return f - elif compression == 'zip': - import zipfile - zip_file = zipfile.ZipFile(f) - zip_names = zip_file.namelist() - - if len(zip_names) == 1: - file_name = zip_names.pop() - f = zip_file.open(file_name) - return f - - elif len(zip_names) == 0: - raise ValueError('Corrupted or zero files found in compressed ' - 'zip file %s', zip_file.filename) - - else: - raise ValueError('Multiple files found in compressed ' - 'zip file %s', str(zip_names)) - - elif compression == 'xz': - - lzma = compat.import_lzma() - f = lzma.LZMAFile(f) - - if compat.PY3: - from io import TextIOWrapper - - f = TextIOWrapper(f) - - return f - - else: - raise ValueError('do not recognize compression method %s' - % compression) - - class PythonParser(ParserBase): def __init__(self, f, **kwds): @@ -1890,20 +1805,10 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] - if isinstance(f, compat.string_types): - f = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression, - memory_map=self.memory_map) - self.handles.append(f) - elif self.compression: - f = _wrap_compressed(f, self.compression, self.encoding) - self.handles.append(f) - # in Python 3, convert BytesIO or fileobjects passed with an encoding - elif compat.PY3 and isinstance(f, compat.BytesIO): - from io import TextIOWrapper - - f = TextIOWrapper(f, encoding=self.encoding) - self.handles.append(f) + f, handles = _get_handle(f, 'r', encoding=self.encoding, + compression=self.compression, + memory_map=self.memory_map) + self.handles.extend(handles) # Set self.data to something that can read lines. 
        if hasattr(f, 'readline'):
diff --git a/pandas/io/s3.py b/pandas/io/s3.py
index df8f1d9187031..8aa3694834a0a 100644
--- a/pandas/io/s3.py
+++ b/pandas/io/s3.py
@@ -99,9 +99,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     conn = boto.connect_s3(host=s3_host, anon=True)
     b = conn.get_bucket(parsed_url.netloc, validate=False)
-    if compat.PY2 and (compression == 'gzip' or
-                       (compression == 'infer' and
-                        filepath_or_buffer.endswith(".gz"))):
+    if compat.PY2 and compression:
         k = boto.s3.key.Key(b, parsed_url.path)
         filepath_or_buffer = BytesIO(k.get_contents_as_string(
             encoding=encoding))
diff --git a/pandas/io/tests/parser/compression.py b/pandas/io/tests/parser/compression.py
index 47ae7be1cbf05..3b0c571032fe6 100644
--- a/pandas/io/tests/parser/compression.py
+++ b/pandas/io/tests/parser/compression.py
@@ -168,3 +168,8 @@ def test_read_csv_infer_compression(self):
         tm.assert_frame_equal(expected, df)
 
         inputs[3].close()
+
+    def test_invalid_compression(self):
+        msg = 'Unrecognized compression type: sfark'
+        with tm.assertRaisesRegexp(ValueError, msg):
+            self.read_csv('test_file.zip', compression='sfark')
diff --git a/pandas/io/tests/parser/test_network.py b/pandas/io/tests/parser/test_network.py
index 9b02096dd0f26..fd7a1babe4e01 100644
--- a/pandas/io/tests/parser/test_network.py
+++ b/pandas/io/tests/parser/test_network.py
@@ -7,6 +7,8 @@
 import os
 import nose
+import functools
+from itertools import product
 
 import pandas.util.testing as tm
 from pandas import DataFrame
@@ -14,24 +16,40 @@
 from pandas.io.parsers import read_csv, read_table
 
 
-class TestUrlGz(tm.TestCase):
-
-    def setUp(self):
-        dirpath = tm.get_data_path()
-        localtable = os.path.join(dirpath, 'salaries.csv')
-        self.local_table = read_table(localtable)
-
-    @tm.network
-    def test_url_gz(self):
-        url = ('https://raw.github.com/pandas-dev/pandas/'
-               'master/pandas/io/tests/parser/data/salaries.csv.gz')
-        url_table = read_table(url, compression="gzip", engine="python")
-        tm.assert_frame_equal(url_table, self.local_table)
-
-    @tm.network
-    def test_url_gz_infer(self):
-        url = 'https://s3.amazonaws.com/pandas-test/salary.table.gz'
-        url_table = read_table(url, compression="infer", engine="python")
+class TestCompressedUrl(object):
+
+    compression_to_extension = {
+        'gzip': '.gz',
+        'bz2': '.bz2',
+        'zip': '.zip',
+        'xz': '.xz',
+    }
+
+    def __init__(self):
+        path = os.path.join(tm.get_data_path(), 'salaries.csv')
+        self.local_table = read_table(path)
+        self.base_url = ('https://github.com/pandas-dev/pandas/raw/master/'
+                         'pandas/io/tests/parser/data/salaries.csv')
+
+    def test_compressed_urls(self):
+        """Test reading compressed tables from URL."""
+        msg = ('Test reading {}-compressed tables from URL: '
+               'compression="{}", engine="{}"')
+
+        for compression, extension in self.compression_to_extension.items():
+            url = self.base_url + extension
+            # args is a (compression, engine) tuple
+            for args in product([compression, 'infer'], ['python']):
+                # test_fxn is a workaround for more descriptive nose reporting.
+                # See http://stackoverflow.com/a/37393684/4651668.
+                test_fxn = functools.partial(self.check_table)
+                test_fxn.description = msg.format(compression, *args)
+                yield (test_fxn, url) + args
+
+    def check_table(self, url, compression, engine):
+        if url.endswith('.xz'):
+            tm._skip_if_no_lzma()
+        url_table = read_table(url, compression=compression, engine=engine)
         tm.assert_frame_equal(url_table, self.local_table)
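Editor's note, not part of the patch: the sketch below illustrates how the two
reworked private helpers are meant to compose after this change. The filename
is hypothetical; the imports assume this PR's pandas.io.common, in which
_get_handle returns a (handle, opened_handles) pair so callers close only what
the helper itself opened, mirroring how PythonParser iterates its handles.

# Minimal usage sketch against this PR's internal API (assumptions noted above).
from pandas.io.common import _get_handle, _infer_compression

path = 'salaries.csv.gz'  # hypothetical gzip-compressed CSV on disk

# 'infer' resolves to 'gzip' from the '.gz' extension; an explicitly given
# but unsupported value (e.g. 'sfark') raises ValueError instead.
compression = _infer_compression(path, 'infer')

f, handles = _get_handle(path, 'r', encoding='utf-8',
                         compression=compression)
try:
    text = f.read()  # on Python 3, f is a TextIOWrapper over the gzip stream
finally:
    # close every file object _get_handle opened, as the parsers do
    for h in handles:
        h.close()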