diff --git a/doc/source/release.rst b/doc/source/release.rst index a3908ab01903d..8488d03f97cbd 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -564,6 +564,8 @@ Bug Fixes - Fixed a bug where ``groupby.plot()`` and friends were duplicating figures multiple times (:issue:`5102`). - Provide automatic conversion of ``object`` dtypes on fillna, related (:issue:`5103`) + - Fixed a bug where default options were being overwritten in the option + parser cleaning (:issue:`5121`). pandas 0.12.0 diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8a2f249f6af06..76d6a3909f89f 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -2,11 +2,10 @@ Module contains tools for processing files into DataFrames or other objects """ from __future__ import print_function -from pandas.compat import range, lrange, StringIO, lzip, zip, string_types +from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map from pandas import compat import re import csv -from warnings import warn import numpy as np @@ -266,7 +265,6 @@ def _read(filepath_or_buffer, kwds): 'buffer_lines': None, 'error_bad_lines': True, 'warn_bad_lines': True, - 'factorize': True, 'dtype': None, 'decimal': b'.' } @@ -340,8 +338,7 @@ def parser_f(filepath_or_buffer, encoding=None, squeeze=False, mangle_dupe_cols=True, - tupleize_cols=False, - ): + tupleize_cols=False): # Alias sep -> delimiter. if delimiter is None: @@ -400,8 +397,7 @@ def parser_f(filepath_or_buffer, low_memory=low_memory, buffer_lines=buffer_lines, mangle_dupe_cols=mangle_dupe_cols, - tupleize_cols=tupleize_cols, - ) + tupleize_cols=tupleize_cols) return _read(filepath_or_buffer, kwds) @@ -490,27 +486,24 @@ def _get_options_with_defaults(self, engine): kwds = self.orig_options options = {} - for argname, default in compat.iteritems(_parser_defaults): - if argname in kwds: - value = kwds[argname] - else: - value = default - options[argname] = value + for argname, default in compat.iteritems(_parser_defaults): + options[argname] = kwds.get(argname, default) for argname, default in compat.iteritems(_c_parser_defaults): if argname in kwds: value = kwds[argname] + if engine != 'c' and value != default: - raise ValueError('%s is not supported with %s parser' % - (argname, engine)) + raise ValueError('The %r option is not supported with the' + ' %r engine' % (argname, engine)) + else: + value = default options[argname] = value if engine == 'python-fwf': for argname, default in compat.iteritems(_fwf_defaults): - if argname in kwds: - value = kwds[argname] - options[argname] = value + options[argname] = kwds.get(argname, default) return options @@ -518,7 +511,9 @@ def _clean_options(self, options, engine): result = options.copy() sep = options['delimiter'] - if (sep is None and not options['delim_whitespace']): + delim_whitespace = options['delim_whitespace'] + + if sep is None and not delim_whitespace: if engine == 'c': print('Using Python parser to sniff delimiter') engine = 'python' @@ -667,21 +662,24 @@ def __init__(self, kwds): self.header = kwds.get('header') if isinstance(self.header,(list,tuple,np.ndarray)): if kwds.get('as_recarray'): - raise Exception("cannot specify as_recarray when " - "specifying a multi-index header") + raise ValueError("cannot specify as_recarray when " + "specifying a multi-index header") if kwds.get('usecols'): - raise Exception("cannot specify usecols when " - "specifying a multi-index header") + raise ValueError("cannot specify usecols when " + "specifying a multi-index header") if kwds.get('names'): - raise Exception("cannot specify names when " - "specifying a multi-index header") + raise ValueError("cannot specify names when " + "specifying a multi-index header") # validate index_col that only contains integers if self.index_col is not None: - if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all( - [ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)): - raise Exception("index_col must only contain row numbers " - "when specifying a multi-index header") + is_sequence = isinstance(self.index_col, (list, tuple, + np.ndarray)) + if not (is_sequence and + all(map(com.is_integer, self.index_col)) or + com.is_integer(self.index_col)): + raise ValueError("index_col must only contain row numbers " + "when specifying a multi-index header") self._name_processed = False diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 44e40dc34ff25..ada6ffdc34257 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1230,17 +1230,17 @@ def test_header_multi_index(self): #### invalid options #### # no as_recarray - self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], index_col=[0,1], as_recarray=True, tupleize_cols=False) # names - self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], index_col=[0,1], names=['foo','bar'], tupleize_cols=False) # usecols - self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False) # non-numeric index_col - self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3], + self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3], index_col=['foo','bar'], tupleize_cols=False) def test_pass_names_with_index(self): @@ -2715,6 +2715,24 @@ def test_warn_if_chunks_have_mismatched_type(self): df = self.read_csv(StringIO(data)) self.assertEqual(df.a.dtype, np.object) + def test_invalid_c_parser_opts_with_not_c_parser(self): + from pandas.io.parsers import _c_parser_defaults as c_defaults + + data = """1,2,3,, +1,2,3,4, +1,2,3,4,5 +1,2,,, +1,2,3,4,""" + + engines = 'python', 'python-fwf' + for default in c_defaults: + for engine in engines: + kwargs = {default: object()} + with tm.assertRaisesRegexp(ValueError, + 'The %r option is not supported ' + 'with the %r engine' % (default, + engine)): + read_csv(StringIO(data), engine=engine, **kwargs) class TestParseSQL(unittest.TestCase): @@ -2783,7 +2801,7 @@ def test_convert_sql_column_decimals(self): def assert_same_values_and_dtype(res, exp): - assert(res.dtype == exp.dtype) + tm.assert_equal(res.dtype, exp.dtype) tm.assert_almost_equal(res, exp) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index d08c020c9e9bc..8625038c57b23 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -237,7 +237,7 @@ cdef class TextReader: cdef: parser_t *parser object file_handle, na_fvalues - bint factorize, na_filter, verbose, has_usecols, has_mi_columns + bint na_filter, verbose, has_usecols, has_mi_columns int parser_start list clocks char *c_encoding @@ -276,7 +276,6 @@ cdef class TextReader: converters=None, - factorize=True, as_recarray=False, skipinitialspace=False, @@ -338,8 +337,6 @@ cdef class TextReader: raise ValueError('only length-1 separators excluded right now') self.parser.delimiter = ord(delimiter) - self.factorize = factorize - #---------------------------------------- # parser options