
Commit 10652a0

Move compression inference to io/parsers

1 parent 8d24bcf · commit 10652a0

2 files changed: +34, -38 lines

pandas/io/common.py  (+4, -14)

@@ -63,13 +63,6 @@ def urlopen(*args, **kwargs):
 _VALID_URLS = set(uses_relative + uses_netloc + uses_params)
 _VALID_URLS.discard('')
 
-_compression_to_extension = {
-    'gzip': '.gz',
-    'bz2': '.bz2',
-    'zip': '.zip',
-    'xz': '.xz',
-}
-
 
 class ParserError(ValueError):
     """
@@ -245,13 +238,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
     if _is_url(filepath_or_buffer):
         url = str(filepath_or_buffer)
         req = _urlopen(url)
-        if compression == 'infer':
-            for compression, extension in _compression_to_extension.items():
-                if url.endswith(extension):
-                    break
-            else:
-                content_encoding = req.headers.get('Content-Encoding', None)
-                compression = 'gzip' if content_encoding == 'gzip' else None
+        content_encoding = req.headers.get('Content-Encoding', None)
+        if content_encoding == 'gzip':
+            # Override compression based on Content-Encoding header
+            compression = 'gzip'
         reader, encoding = maybe_read_encoded_stream(req, encoding, compression)
         return reader, encoding, compression
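For illustration only: after this change the URL branch of get_filepath_or_buffer no longer looks at file extensions at all; it only promotes compression to 'gzip' when the response reports a gzip Content-Encoding. A minimal standalone sketch of that rule (the helper name and the example values are hypothetical, not part of the commit):

```python
def resolve_url_compression(compression, content_encoding):
    # Mirror of the new header check: a gzip Content-Encoding always wins,
    # anything else leaves the previously inferred compression untouched.
    if content_encoding == 'gzip':
        return 'gzip'
    return compression


# A plain URL served through a gzip-encoding proxy:
print(resolve_url_compression(None, 'gzip'))   # 'gzip'
# Extension-based inference (now done in io/parsers) is left alone otherwise:
print(resolve_url_compression('bz2', None))    # 'bz2'
```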

pandas/io/parsers.py  (+30, -24)

@@ -353,38 +353,44 @@ def _validate_nrows(nrows):
     return nrows
 
 
+_compression_to_extension = {
+    'gzip': '.gz',
+    'bz2': '.bz2',
+    'zip': '.zip',
+    'xz': '.xz',
+}
+
+
+def _infer_compression(filepath_or_buffer):
+    """
+    Infer compression of a filepath or buffer. In case of buffer, compression
+    is None. Otherwise, inference is performed using the extension of the
+    filename or URL.
+    """
+    if not isinstance(filepath_or_buffer, compat.string_types):
+        return None
+    filepath = str(filepath_or_buffer)
+    for compression, extension in _compression_to_extension.items():
+        if filepath.endswith(extension):
+            return compression
+    return None
+
+
 def _read(filepath_or_buffer, kwds):
-    "Generic reader of line files."
+    """Generic reader of line files."""
     encoding = kwds.get('encoding', None)
     if encoding is not None:
         encoding = re.sub('_', '-', encoding).lower()
         kwds['encoding'] = encoding
 
-    # If the input could be a filename, check for a recognizable compression
-    # extension. If we're reading from a URL, the `get_filepath_or_buffer`
-    # will use header info to determine compression, so use what it finds in
-    # that case.
-    inferred_compression = kwds.get('compression')
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            elif filepath_or_buffer.endswith('.zip'):
-                inferred_compression = 'zip'
-            elif filepath_or_buffer.endswith('.xz'):
-                inferred_compression = 'xz'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
+    compression = kwds.get('compression')
+    if compression not in set(_compression_to_extension) | {None, 'infer'}:
+        raise ValueError('"{}" is not a valid compression'.format(compression))
+
+    if compression == 'infer':
+        compression = _infer_compression(filepath_or_buffer)
 
     filepath_or_buffer, _, compression = get_filepath_or_buffer(
-        filepath_or_buffer, encoding,
-        compression=kwds.get('compression', None))
-    kwds['compression'] = (inferred_compression if compression == 'infer'
-                           else compression)
+        filepath_or_buffer, encoding, compression)
+    kwds['compression'] = compression
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
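A rough sketch of how the new extension-based inference behaves from a caller's point of view, using a standalone copy of the mapping from the diff (a plain str check stands in for compat.string_types and the function name is unprefixed, so this is an approximation rather than the committed code):

```python
import io

_compression_to_extension = {
    'gzip': '.gz',
    'bz2': '.bz2',
    'zip': '.zip',
    'xz': '.xz',
}


def infer_compression(filepath_or_buffer):
    # Buffers cannot be inferred from a filename, so they get None;
    # strings (paths or URLs) are matched on their extension.
    if not isinstance(filepath_or_buffer, str):
        return None
    filepath = str(filepath_or_buffer)
    for compression, extension in _compression_to_extension.items():
        if filepath.endswith(extension):
            return compression
    return None


print(infer_compression('data.csv.gz'))        # 'gzip'
print(infer_compression('data.csv'))           # None (no recognised extension)
print(infer_compression(io.StringIO('a,b')))   # None (buffer, not a path)
```

With the validation added in _read, any other value passed as compression (for example 'rar') now raises a ValueError before the file is opened.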

0 commit comments