Skip to content

Commit bea5051

Browse files
committed
Merge pull request #5121 from cpcloud/parser-options-mania
CLN: clean up parser options
2 parents 0a71f7a + 1f82496 commit bea5051

File tree

4 files changed

+53
-38
lines changed

4 files changed

+53
-38
lines changed

doc/source/release.rst

+2
Original file line number | Diff line number | Diff line change
@@ -564,6 +564,8 @@ Bug Fixes
564564
- Fixed a bug where ``groupby.plot()`` and friends were duplicating figures
565565
multiple times (:issue:`5102`).
566566
- Provide automatic conversion of ``object`` dtypes on ``fillna`` (related to :issue:`5103`)
567+
- Fixed a bug where default parser options were being overwritten while
568+
cleaning up the parser options (:issue:`5121`).
567569

568570

569571
pandas 0.12.0

pandas/io/parsers.py

+27-29
Original file line number | Diff line number | Diff line change
@@ -2,11 +2,10 @@
22
Module contains tools for processing files into DataFrames or other objects
33
"""
44
from __future__ import print_function
5-
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types
5+
from pandas.compat import range, lrange, StringIO, lzip, zip, string_types, map
66
from pandas import compat
77
import re
88
import csv
9-
from warnings import warn
109

1110
import numpy as np
1211

@@ -266,7 +265,6 @@ def _read(filepath_or_buffer, kwds):
266265
'buffer_lines': None,
267266
'error_bad_lines': True,
268267
'warn_bad_lines': True,
269-
'factorize': True,
270268
'dtype': None,
271269
'decimal': b'.'
272270
}
@@ -340,8 +338,7 @@ def parser_f(filepath_or_buffer,
340338
encoding=None,
341339
squeeze=False,
342340
mangle_dupe_cols=True,
343-
tupleize_cols=False,
344-
):
341+
tupleize_cols=False):
345342

346343
# Alias sep -> delimiter.
347344
if delimiter is None:
@@ -400,8 +397,7 @@ def parser_f(filepath_or_buffer,
400397
low_memory=low_memory,
401398
buffer_lines=buffer_lines,
402399
mangle_dupe_cols=mangle_dupe_cols,
403-
tupleize_cols=tupleize_cols,
404-
)
400+
tupleize_cols=tupleize_cols)
405401

406402
return _read(filepath_or_buffer, kwds)
407403

@@ -490,35 +486,34 @@ def _get_options_with_defaults(self, engine):
490486
kwds = self.orig_options
491487

492488
options = {}
493-
for argname, default in compat.iteritems(_parser_defaults):
494-
if argname in kwds:
495-
value = kwds[argname]
496-
else:
497-
value = default
498489

499-
options[argname] = value
490+
for argname, default in compat.iteritems(_parser_defaults):
491+
options[argname] = kwds.get(argname, default)
500492

501493
for argname, default in compat.iteritems(_c_parser_defaults):
502494
if argname in kwds:
503495
value = kwds[argname]
496+
504497
if engine != 'c' and value != default:
505-
raise ValueError('%s is not supported with %s parser' %
506-
(argname, engine))
498+
raise ValueError('The %r option is not supported with the'
499+
' %r engine' % (argname, engine))
500+
else:
501+
value = default
507502
options[argname] = value
508503

509504
if engine == 'python-fwf':
510505
for argname, default in compat.iteritems(_fwf_defaults):
511-
if argname in kwds:
512-
value = kwds[argname]
513-
options[argname] = value
506+
options[argname] = kwds.get(argname, default)
514507

515508
return options
516509

517510
def _clean_options(self, options, engine):
518511
result = options.copy()
519512

520513
sep = options['delimiter']
521-
if (sep is None and not options['delim_whitespace']):
514+
delim_whitespace = options['delim_whitespace']
515+
516+
if sep is None and not delim_whitespace:
522517
if engine == 'c':
523518
print('Using Python parser to sniff delimiter')
524519
engine = 'python'
@@ -667,21 +662,24 @@ def __init__(self, kwds):
667662
self.header = kwds.get('header')
668663
if isinstance(self.header,(list,tuple,np.ndarray)):
669664
if kwds.get('as_recarray'):
670-
raise Exception("cannot specify as_recarray when "
671-
"specifying a multi-index header")
665+
raise ValueError("cannot specify as_recarray when "
666+
"specifying a multi-index header")
672667
if kwds.get('usecols'):
673-
raise Exception("cannot specify usecols when "
674-
"specifying a multi-index header")
668+
raise ValueError("cannot specify usecols when "
669+
"specifying a multi-index header")
675670
if kwds.get('names'):
676-
raise Exception("cannot specify names when "
677-
"specifying a multi-index header")
671+
raise ValueError("cannot specify names when "
672+
"specifying a multi-index header")
678673

679674
# validate index_col that only contains integers
680675
if self.index_col is not None:
681-
if not (isinstance(self.index_col,(list,tuple,np.ndarray)) and all(
682-
[ com.is_integer(i) for i in self.index_col ]) or com.is_integer(self.index_col)):
683-
raise Exception("index_col must only contain row numbers "
684-
"when specifying a multi-index header")
676+
is_sequence = isinstance(self.index_col, (list, tuple,
677+
np.ndarray))
678+
if not (is_sequence and
679+
all(map(com.is_integer, self.index_col)) or
680+
com.is_integer(self.index_col)):
681+
raise ValueError("index_col must only contain row numbers "
682+
"when specifying a multi-index header")
685683

686684
self._name_processed = False
687685

pandas/io/tests/test_parsers.py

+23-5
Original file line number | Diff line number | Diff line change
@@ -1230,17 +1230,17 @@ def test_header_multi_index(self):
12301230
#### invalid options ####
12311231

12321232
# no as_recarray
1233-
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1233+
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
12341234
index_col=[0,1], as_recarray=True, tupleize_cols=False)
12351235

12361236
# names
1237-
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1237+
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
12381238
index_col=[0,1], names=['foo','bar'], tupleize_cols=False)
12391239
# usecols
1240-
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1240+
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
12411241
index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False)
12421242
# non-numeric index_col
1243-
self.assertRaises(Exception, read_csv, StringIO(data), header=[0,1,2,3],
1243+
self.assertRaises(ValueError, read_csv, StringIO(data), header=[0,1,2,3],
12441244
index_col=['foo','bar'], tupleize_cols=False)
12451245

12461246
def test_pass_names_with_index(self):
@@ -2715,6 +2715,24 @@ def test_warn_if_chunks_have_mismatched_type(self):
27152715
df = self.read_csv(StringIO(data))
27162716
self.assertEqual(df.a.dtype, np.object)
27172717

2718+
def test_invalid_c_parser_opts_with_not_c_parser(self):
2719+
from pandas.io.parsers import _c_parser_defaults as c_defaults
2720+
2721+
data = """1,2,3,,
2722+
1,2,3,4,
2723+
1,2,3,4,5
2724+
1,2,,,
2725+
1,2,3,4,"""
2726+
2727+
engines = 'python', 'python-fwf'
2728+
for default in c_defaults:
2729+
for engine in engines:
2730+
kwargs = {default: object()}
2731+
with tm.assertRaisesRegexp(ValueError,
2732+
'The %r option is not supported '
2733+
'with the %r engine' % (default,
2734+
engine)):
2735+
read_csv(StringIO(data), engine=engine, **kwargs)
27182736

27192737
class TestParseSQL(unittest.TestCase):
27202738

@@ -2783,7 +2801,7 @@ def test_convert_sql_column_decimals(self):
27832801

27842802

27852803
def assert_same_values_and_dtype(res, exp):
2786-
assert(res.dtype == exp.dtype)
2804+
tm.assert_equal(res.dtype, exp.dtype)
27872805
tm.assert_almost_equal(res, exp)
27882806

27892807

pandas/parser.pyx

+1-4
Original file line number | Diff line number | Diff line change
@@ -237,7 +237,7 @@ cdef class TextReader:
237237
cdef:
238238
parser_t *parser
239239
object file_handle, na_fvalues
240-
bint factorize, na_filter, verbose, has_usecols, has_mi_columns
240+
bint na_filter, verbose, has_usecols, has_mi_columns
241241
int parser_start
242242
list clocks
243243
char *c_encoding
@@ -276,7 +276,6 @@ cdef class TextReader:
276276

277277
converters=None,
278278

279-
factorize=True,
280279
as_recarray=False,
281280

282281
skipinitialspace=False,
@@ -338,8 +337,6 @@ cdef class TextReader:
338337
raise ValueError('only length-1 separators excluded right now')
339338
self.parser.delimiter = ord(delimiter)
340339

341-
self.factorize = factorize
342-
343340
#----------------------------------------
344341
# parser options
345342

0 commit comments

Comments
 (0)