Skip to content

CLN: clean up parser options #5121

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Oct 5, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,8 @@ Bug Fixes
- Fixed a bug where ``groupby.plot()`` and friends were duplicating figures
multiple times (:issue:`5102`).
- Provide automatic conversion of ``object`` dtypes on ``fillna`` (related to :issue:`5103`)
- Fixed a bug where default parser options were being overwritten while the
parser options were being cleaned (:issue:`5121`).


pandas 0.12.0
Expand Down
56 changes: 27 additions & 29 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,10 @@
Module contains tools for processing files into DataFrames or other objects
"""
from __future__ import print_function
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
from pandas import compat
import re
import csv
from warnings import warn

import numpy as np

Expand Down Expand Up @@ -266,7 +265,6 @@ def _read(filepath_or_buffer, kwds):
'buffer_lines': None,
'error_bad_lines': True,
'warn_bad_lines': True,
'factorize': True,
'dtype': None,
'decimal': b'.'
}
Expand Down Expand Up @@ -340,8 +338,7 @@ def parser_f(filepath_or_buffer,
encoding=None,
squeeze=False,
mangle_dupe_cols=True,
tupleize_cols=False,
):
tupleize_cols=False):

# Alias sep -> delimiter.
if delimiter is None:
Expand Down Expand Up @@ -400,8 +397,7 @@ def parser_f(filepath_or_buffer,
low_memory=low_memory,
buffer_lines=buffer_lines,
mangle_dupe_cols=mangle_dupe_cols,
tupleize_cols=tupleize_cols,
)
tupleize_cols=tupleize_cols)

return _read(filepath_or_buffer, kwds)

Expand Down Expand Up @@ -490,35 +486,34 @@ def _get_options_with_defaults(self, engine):
kwds = self.orig_options

options = {}
for argname, default in compat.iteritems(_parser_defaults):
if argname in kwds:
value = kwds[argname]
else:
value = default

options[argname] = value
for argname, default in compat.iteritems(_parser_defaults):
options[argname] = kwds.get(argname, default)

for argname, default in compat.iteritems(_c_parser_defaults):
if argname in kwds:
value = kwds[argname]

if engine != 'c' and value != default:
raise ValueError('%s is not supported with %s parser' %
(argname, engine))
raise ValueError('The %r option is not supported with the'
' %r engine' % (argname, engine))
else:
value = default
options[argname] = value

if engine == 'python-fwf':
for argname, default in compat.iteritems(_fwf_defaults):
if argname in kwds:
value = kwds[argname]
options[argname] = value
options[argname] = kwds.get(argname, default)

return options

def _clean_options(self, options, engine):
result = options.copy()

sep = options['delimiter']
if (sep is None and not options['delim_whitespace']):
delim_whitespace = options['delim_whitespace']

if sep is None and not delim_whitespace:
if engine == 'c':
print('Using Python parser to sniff delimiter')
engine = 'python'
Expand Down Expand Up @@ -667,21 +662,24 @@ def __init__(self, kwds):
self.header = kwds.get('header')
if isinstance(self.header,(list,tuple,np.ndarray)):
if kwds.get('as_recarray'):
raise Exception("cannot specify as_recarray when "
"specifying a multi-index header")
raise ValueError("cannot specify as_recarray when "
"specifying a multi-index header")
if kwds.get('usecols'):
raise Exception("cannot specify usecols when "
"specifying a multi-index header")
raise ValueError("cannot specify usecols when "
"specifying a multi-index header")
if kwds.get('names'):
raise Exception("cannot specify names when "
"specifying a multi-index header")
raise ValueError("cannot specify names when "
"specifying a multi-index header")

# validate index_col that only contains integers
if self.index_col is not None:
if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all(
[ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)):
raise Exception("index_col must only contain row numbers "
"when specifying a multi-index header")
is_sequence = isinstance(self.index_col, (list, tuple,
np.ndarray))
if not (is_sequence and
all(map(com.is_integer, self.index_col)) or
com.is_integer(self.index_col)):
raise ValueError("index_col must only contain row numbers "
"when specifying a multi-index header")

self._name_processed = False

Expand Down
28 changes: 23 additions & 5 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1230,17 +1230,17 @@ def test_header_multi_index(self):
#### invalid options ####

# no as_recarray
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
index_col=[0,1], as_recarray=True, tupleize_cols=False)

# names
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
index_col=[0,1], names=['foo','bar'], tupleize_cols=False)
# usecols
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)
# non-numeric index_col
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
index_col=['foo','bar'], tupleize_cols=False)

def test_pass_names_with_index(self):
Expand Down Expand Up @@ -2715,6 +2715,24 @@ def test_warn_if_chunks_have_mismatched_type(self):
df = self.read_csv(StringIO(data))
self.assertEqual(df.a.dtype, np.object)

def test_invalid_c_parser_opts_with_not_c_parser(self):
    """C-parser-only options must be rejected by the non-C engines.

    Every key of ``_c_parser_defaults`` is passed to ``read_csv`` with a
    fresh ``object()`` as its value, once for each of the ``'python'``
    and ``'python-fwf'`` engines.  Each call is expected to raise a
    ``ValueError`` whose message names both the option and the engine.
    """
    from pandas.io.parsers import _c_parser_defaults as c_defaults

    data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""

    non_c_engines = ('python', 'python-fwf')

    # Option-major, engine-minor: the same visiting order as two nested
    # loops over the options and then the engines.
    combos = [(opt, eng) for opt in c_defaults for eng in non_c_engines]

    for opt, eng in combos:
        bad_kwargs = {opt: object()}
        with tm.assertRaisesRegexp(ValueError,
                                   'The %r option is not supported '
                                   'with the %r engine' % (opt, eng)):
            read_csv(StringIO(data), engine=eng, **bad_kwargs)

class TestParseSQL(unittest.TestCase):

Expand Down Expand Up @@ -2783,7 +2801,7 @@ def test_convert_sql_column_decimals(self):


def assert_same_values_and_dtype(res, exp):
assert(res.dtype == exp.dtype)
tm.assert_equal(res.dtype, exp.dtype)
tm.assert_almost_equal(res, exp)


Expand Down
5 changes: 1 addition & 4 deletions pandas/parser.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ cdef class TextReader:
cdef:
parser_t *parser
object file_handle, na_fvalues
bint factorize, na_filter, verbose, has_usecols, has_mi_columns
bint na_filter, verbose, has_usecols, has_mi_columns
int parser_start
list clocks
char *c_encoding
Expand Down Expand Up @@ -276,7 +276,6 @@ cdef class TextReader:

converters=None,

factorize=True,
as_recarray=False,

skipinitialspace=False,
Expand Down Expand Up @@ -338,8 +337,6 @@ cdef class TextReader:
raise ValueError('only length-1 separators excluded right now')
self.parser.delimiter = ord(delimiter)

self.factorize = factorize

#----------------------------------------
# parser options

Expand Down