CLN: clean up parser options

cpcloud · cpcloud · commit 4881213a2e28 · 2013-10-05T18:35:39.000-04:00
Also add a test to make sure that the validation of C-parser-only options is
actually covered when a non-C engine is requested.
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -564,6 +564,8 @@ Bug Fixes
   - Fixed a bug where ``groupby.plot()`` and friends were duplicating figures
     multiple times (:issue:`5102`).
   - Provide automatic conversion of ``object`` dtypes on fillna, related (:issue:`5103`)
+  - Fixed a bug where default options were being overwritten while the
+    parser options were being cleaned (:issue:`5121`).
 
 
 pandas 0.12.0
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -341,7 +341,7 @@ def parser_f(filepath_or_buffer,
                  squeeze=False,
                  mangle_dupe_cols=True,
                  tupleize_cols=False,
-                 ):
+                 factorize=True):
 
         # Alias sep -> delimiter.
         if delimiter is None:
@@ -401,7 +401,7 @@ def parser_f(filepath_or_buffer,
                     buffer_lines=buffer_lines,
                     mangle_dupe_cols=mangle_dupe_cols,
                     tupleize_cols=tupleize_cols,
-            )
+                    factorize=factorize)
 
         return _read(filepath_or_buffer, kwds)
 
@@ -490,35 +490,34 @@ def _get_options_with_defaults(self, engine):
         kwds = self.orig_options
 
         options = {}
-        for argname, default in compat.iteritems(_parser_defaults):
-            if argname in kwds:
-                value = kwds[argname]
-            else:
-                value = default
 
-            options[argname] = value
+        for argname, default in compat.iteritems(_parser_defaults):
+            options[argname] = kwds.get(argname, default)
 
         for argname, default in compat.iteritems(_c_parser_defaults):
             if argname in kwds:
                 value = kwds[argname]
+
                 if engine != 'c' and value != default:
-                    raise ValueError('%s is not supported with %s parser' %
-                                     (argname, engine))
+                    raise ValueError('The %r option is not supported with the'
+                                     ' %r engine' % (argname, engine))
+            else:
+                value = default
             options[argname] = value
 
         if engine == 'python-fwf':
             for argname, default in compat.iteritems(_fwf_defaults):
-                if argname in kwds:
-                    value = kwds[argname]
-                options[argname] = value
+                options[argname] = kwds.get(argname, default)
 
         return options
 
     def _clean_options(self, options, engine):
         result = options.copy()
 
         sep = options['delimiter']
-        if (sep is None and not options['delim_whitespace']):
+        delim_whitespace = options['delim_whitespace']
+
+        if sep is None and not delim_whitespace:
             if engine == 'c':
                 print('Using Python parser to sniff delimiter')
                 engine = 'python'
@@ -667,21 +666,24 @@ def __init__(self, kwds):
         self.header = kwds.get('header')
         if isinstance(self.header,(list,tuple,np.ndarray)):
             if kwds.get('as_recarray'):
-                raise Exception("cannot specify as_recarray when "
-                                "specifying a multi-index header")
+                raise ValueError("cannot specify as_recarray when "
+                                 "specifying a multi-index header")
             if kwds.get('usecols'):
-                raise Exception("cannot specify usecols when "
-                                "specifying a multi-index header")
+                raise ValueError("cannot specify usecols when "
+                                 "specifying a multi-index header")
             if kwds.get('names'):
-                raise Exception("cannot specify names when "
-                                "specifying a multi-index header")
+                raise ValueError("cannot specify names when "
+                                 "specifying a multi-index header")
 
             # validate index_col that only contains integers
             if self.index_col is not None:
-                if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all(
-                        [ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)):
-                    raise Exception("index_col must only contain row numbers "
-                                    "when specifying a multi-index header")
+                is_sequence = isinstance(self.index_col, (list, tuple,
+                                                          np.ndarray))
+                if not (is_sequence and
+                        all(com.is_integer(i) for i in self.index_col) or
+                        com.is_integer(self.index_col)):
+                    raise ValueError("index_col must only contain row numbers "
+                                     "when specifying a multi-index header")
 
         self._name_processed = False
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1230,17 +1230,17 @@ def test_header_multi_index(self):
         #### invalid options ####
 
         # no as_recarray
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], as_recarray=True, tupleize_cols=False)
 
         # names
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], names=['foo','bar'], tupleize_cols=False)
         # usecols
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)
         # non-numeric index_col
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=['foo','bar'], tupleize_cols=False)
 
     def test_pass_names_with_index(self):
@@ -2715,6 +2715,24 @@ def test_warn_if_chunks_have_mismatched_type(self):
             df = self.read_csv(StringIO(data))
         self.assertEqual(df.a.dtype, np.object)
 
+    def test_invalid_c_parser_opts_with_not_c_parser(self):
+        from pandas.io.parsers import _c_parser_defaults as c_defaults
+
+        data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+
+        engines = 'python', 'python-fwf'
+        for default in c_defaults:
+            for engine in engines:
+                kwargs = {default: object()}
+                with tm.assertRaisesRegexp(ValueError,
+                                           'The %r option is not supported '
+                                           'with the %r engine' % (default,
+                                                                   engine)):
+                    read_csv(StringIO(data), engine=engine, **kwargs)
 
 class TestParseSQL(unittest.TestCase):
 
@@ -2783,7 +2801,7 @@ def test_convert_sql_column_decimals(self):
 
 
 def assert_same_values_and_dtype(res, exp):
-    assert(res.dtype == exp.dtype)
+    tm.assert_equal(res.dtype, exp.dtype)
     tm.assert_almost_equal(res, exp)