
Commit 064c1b3

Move compression inference to io/parsers
1 parent 139bf82 · commit 064c1b3

2 files changed: +34 -38 lines changed


pandas/io/common.py (+4 -14)
@@ -63,13 +63,6 @@ def urlopen(*args, **kwargs):
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 
-_compression_to_extension = {
-    'gzip': '.gz',
-    'bz2': '.bz2',
-    'zip': '.zip',
-    'xz': '.xz',
-}
-
 
 class CParserError(ValueError):
     """
@@ -243,13 +236,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     if _is_url(filepath_or_buffer):
         url = str(filepath_or_buffer)
         req = _urlopen(url)
-        if compression == 'infer':
-            for compression, extension in _compression_to_extension.items():
-                if url.endswith(extension):
-                    break
-            else:
-                content_encoding = req.headers.get('Content-Encoding', None)
-                compression = 'gzip' if content_encoding == 'gzip' else None
+        content_encoding = req.headers.get('Content-Encoding', None)
+        if content_encoding == 'gzip':
+            # Override compression based on Content-Encoding header
+            compression = 'gzip'
         reader, encoding = maybe_read_encoded_stream(req, encoding, compression)
         return reader, encoding, compression
 
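
For context, a small standalone sketch (not part of the commit) of the behaviour the simplified URL branch now has: whatever compression value comes in, including one inferred from the file extension, is only overridden when the server responds with a gzip Content-Encoding header. The FakeResponse class and the example calls are hypothetical, for illustration only.

# Illustrative sketch only, not part of the commit. "FakeResponse" is a
# hypothetical stand-in for the object returned by _urlopen(url).
class FakeResponse(object):
    def __init__(self, headers):
        self.headers = headers


def resolve_url_compression(req, compression):
    # Mirrors the new branch in get_filepath_or_buffer: only a gzip
    # Content-Encoding response header overrides the incoming value.
    content_encoding = req.headers.get('Content-Encoding', None)
    if content_encoding == 'gzip':
        compression = 'gzip'
    return compression


print(resolve_url_compression(FakeResponse({'Content-Encoding': 'gzip'}), None))  # gzip
print(resolve_url_compression(FakeResponse({}), 'bz2'))                           # bz2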

pandas/io/parsers.py (+30 -24)
@@ -342,38 +342,44 @@ def _validate_nrows(nrows):
     return nrows
 
 
+_compression_to_extension = {
+    'gzip': '.gz',
+    'bz2': '.bz2',
+    'zip': '.zip',
+    'xz': '.xz',
+}
+
+def _infer_compression(filepath_or_buffer):
+    """
+    Infer compression of a filepath or buffer. In case of buffer, compression
+    is None. Otherwise, inference is perfomed using the extension of the
+    filename or URL.
+    """
+    if not isinstance(filepath_or_buffer, compat.string_types):
+        return None
+    filepath = str(filepath_or_buffer)
+    for compression, extension in _compression_to_extension.items():
+        if filepath.endswith(extension):
+            return compression
+    return None
+
 def _read(filepath_or_buffer, kwds):
-    "Generic reader of line files."
+    """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
     if encoding is not None:
         encoding = re.sub('_', '-', encoding).lower()
         kwds['encoding'] = encoding
 
-    # If the input could be a filename, check for a recognizable compression
-    # extension. If we're reading from a URL, the `get_filepath_or_buffer`
-    # will use header info to determine compression, so use what it finds in
-    # that case.
-    inferred_compression = kwds.get('compression')
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            elif filepath_or_buffer.endswith('.zip'):
-                inferred_compression = 'zip'
-            elif filepath_or_buffer.endswith('.xz'):
-                inferred_compression = 'xz'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
+    compression = kwds.get('compression')
+    if compression not in set(_compression_to_extension) | {None, 'infer'}:
+        raise ValueError('"{}" is not a valid compression'.format(compression))
+
+    if compression == 'infer':
+        compression = _infer_compression(filepath_or_buffer)
 
     filepath_or_buffer, _, compression = get_filepath_or_buffer(
-        filepath_or_buffer, encoding,
-        compression=kwds.get('compression', None))
-    kwds['compression'] = (inferred_compression if compression == 'infer'
-                           else compression)
+        filepath_or_buffer, encoding, compression)
+    kwds['compression'] = compression
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
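
To make the moved logic concrete, here is a small self-contained sketch of the extension-based inference that _infer_compression now performs, written with plain str instead of compat.string_types; the helper name _extension_map and the file names are made up for illustration.

# Standalone sketch of the inference added to pandas/io/parsers.py.
# Uses plain `str` instead of compat.string_types for brevity.
_extension_map = {
    'gzip': '.gz',
    'bz2': '.bz2',
    'zip': '.zip',
    'xz': '.xz',
}


def infer_compression(filepath_or_buffer):
    # Buffers (file handles, StringIO, ...) carry no usable name.
    if not isinstance(filepath_or_buffer, str):
        return None
    for compression, extension in _extension_map.items():
        if filepath_or_buffer.endswith(extension):
            return compression
    return None


print(infer_compression('data.csv.gz'))                 # gzip
print(infer_compression('https://host/path/data.bz2'))  # bz2
print(infer_compression('data.csv'))                    # None

With inference factored out this way, _read only has to validate the compression keyword (raising ValueError for anything outside the known set, None, or 'infer') and pass the resolved value straight through to get_filepath_or_buffer.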
