Merge pull request #5121 from cpcloud/parser-options-mania

cpcloud · cpcloud · commit bea505109c88 · 2013-10-05T16:28:25.000-07:00
CLN: clean up parser options
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -564,6 +564,8 @@ Bug Fixes
   - Fixed a bug where ``groupby.plot()`` and friends were duplicating figures
     multiple times (:issue:`5102`).
   - Provide automatic conversion of ``object`` dtypes on fillna, related (:issue:`5103`)
+  - Fixed a bug where default options were being overwritten when cleaning
+    parser options (:issue:`5121`).
 
 
 pandas 0.12.0
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -2,11 +2,10 @@
 Module contains tools for processing files into DataFrames or other objects
 """
 from __future__ import print_function
-from pandas.compat import range, lrange, StringIO, lzip, zip, string_types
+from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
 from pandas import compat
 import re
 import csv
-from warnings import warn
 
 import numpy as np
 
@@ -266,7 +265,6 @@ def _read(filepath_or_buffer, kwds):
     'buffer_lines': None,
     'error_bad_lines': True,
     'warn_bad_lines': True,
-    'factorize': True,
     'dtype': None,
     'decimal': b'.'
 }
@@ -340,8 +338,7 @@ def parser_f(filepath_or_buffer,
                  encoding=None,
                  squeeze=False,
                  mangle_dupe_cols=True,
-                 tupleize_cols=False,
-                 ):
+                 tupleize_cols=False):
 
         # Alias sep -> delimiter.
         if delimiter is None:
@@ -400,8 +397,7 @@ def parser_f(filepath_or_buffer,
                     low_memory=low_memory,
                     buffer_lines=buffer_lines,
                     mangle_dupe_cols=mangle_dupe_cols,
-                    tupleize_cols=tupleize_cols,
-            )
+                    tupleize_cols=tupleize_cols)
 
         return _read(filepath_or_buffer, kwds)
 
@@ -490,35 +486,34 @@ def _get_options_with_defaults(self, engine):
         kwds = self.orig_options
 
         options = {}
-        for argname, default in compat.iteritems(_parser_defaults):
-            if argname in kwds:
-                value = kwds[argname]
-            else:
-                value = default
 
-            options[argname] = value
+        for argname, default in compat.iteritems(_parser_defaults):
+            options[argname] = kwds.get(argname, default)
 
         for argname, default in compat.iteritems(_c_parser_defaults):
             if argname in kwds:
                 value = kwds[argname]
+
                 if engine != 'c' and value != default:
-                    raise ValueError('%s is not supported with %s parser' %
-                                     (argname, engine))
+                    raise ValueError('The %r option is not supported with the'
+                                     ' %r engine' % (argname, engine))
+            else:
+                value = default
             options[argname] = value
 
         if engine == 'python-fwf':
             for argname, default in compat.iteritems(_fwf_defaults):
-                if argname in kwds:
-                    value = kwds[argname]
-                options[argname] = value
+                options[argname] = kwds.get(argname, default)
 
         return options
 
     def _clean_options(self, options, engine):
         result = options.copy()
 
         sep = options['delimiter']
-        if (sep is None and not options['delim_whitespace']):
+        delim_whitespace = options['delim_whitespace']
+
+        if sep is None and not delim_whitespace:
             if engine == 'c':
                 print('Using Python parser to sniff delimiter')
                 engine = 'python'
@@ -667,21 +662,24 @@ def __init__(self, kwds):
         self.header = kwds.get('header')
         if isinstance(self.header,(list,tuple,np.ndarray)):
             if kwds.get('as_recarray'):
-                raise Exception("cannot specify as_recarray when "
-                                "specifying a multi-index header")
+                raise ValueError("cannot specify as_recarray when "
+                                 "specifying a multi-index header")
             if kwds.get('usecols'):
-                raise Exception("cannot specify usecols when "
-                                "specifying a multi-index header")
+                raise ValueError("cannot specify usecols when "
+                                 "specifying a multi-index header")
             if kwds.get('names'):
-                raise Exception("cannot specify names when "
-                                "specifying a multi-index header")
+                raise ValueError("cannot specify names when "
+                                 "specifying a multi-index header")
 
             # validate index_col that only contains integers
             if self.index_col is not None:
-                if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all(
-                        [ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)):
-                    raise Exception("index_col must only contain row numbers "
-                                    "when specifying a multi-index header")
+                is_sequence = isinstance(self.index_col, (list, tuple,
+                                                          np.ndarray))
+                if not (is_sequence and
+                        all(map(com.is_integer, self.index_col)) or
+                        com.is_integer(self.index_col)):
+                    raise ValueError("index_col must only contain row numbers "
+                                     "when specifying a multi-index header")
 
         self._name_processed = False
 
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
@@ -1230,17 +1230,17 @@ def test_header_multi_index(self):
         #### invalid options ####
 
         # no as_recarray
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], as_recarray=True, tupleize_cols=False)
 
         # names
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], names=['foo','bar'], tupleize_cols=False)
         # usecols
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)
         # non-numeric index_col
-        self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
+        self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
                           index_col=['foo','bar'], tupleize_cols=False)
 
     def test_pass_names_with_index(self):
@@ -2715,6 +2715,24 @@ def test_warn_if_chunks_have_mismatched_type(self):
             df = self.read_csv(StringIO(data))
         self.assertEqual(df.a.dtype, np.object)
 
+    def test_invalid_c_parser_opts_with_not_c_parser(self):
+        from pandas.io.parsers import _c_parser_defaults as c_defaults
+
+        data = """1,2,3,,
+1,2,3,4,
+1,2,3,4,5
+1,2,,,
+1,2,3,4,"""
+
+        engines = 'python', 'python-fwf'
+        for default in c_defaults:
+            for engine in engines:
+                kwargs = {default: object()}
+                with tm.assertRaisesRegexp(ValueError,
+                                           'The %r option is not supported '
+                                           'with the %r engine' % (default,
+                                                                   engine)):
+                    read_csv(StringIO(data), engine=engine, **kwargs)
 
 class TestParseSQL(unittest.TestCase):
 
@@ -2783,7 +2801,7 @@ def test_convert_sql_column_decimals(self):
 
 
 def assert_same_values_and_dtype(res, exp):
-    assert(res.dtype == exp.dtype)
+    tm.assert_equal(res.dtype, exp.dtype)
     tm.assert_almost_equal(res, exp)
 
 
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
@@ -237,7 +237,7 @@ cdef class TextReader:
     cdef:
         parser_t *parser
         object file_handle, na_fvalues
-        bint factorize, na_filter, verbose, has_usecols, has_mi_columns
+        bint na_filter, verbose, has_usecols, has_mi_columns
         int parser_start
         list clocks
         char *c_encoding
@@ -276,7 +276,6 @@ cdef class TextReader:
 
                   converters=None,
 
-                  factorize=True,
                   as_recarray=False,
 
                   skipinitialspace=False,
@@ -338,8 +337,6 @@ cdef class TextReader:
                 raise ValueError('only length-1 separators excluded right now')
             self.parser.delimiter = ord(delimiter)
 
-        self.factorize = factorize
-
         #----------------------------------------
         # parser options