|
6 | 6 | from pandas import compat
|
7 | 7 | import re
|
8 | 8 | import csv
|
| 9 | +import warnings |
9 | 10 |
|
10 | 11 | import numpy as np
|
11 | 12 |
|
|
24 | 25 | import pandas.tslib as tslib
|
25 | 26 | import pandas.parser as _parser
|
26 | 27 |
|
| 28 | +class ParserWarning(Warning): |
| 29 | + pass |
27 | 30 |
|
28 | 31 | _parser_params = """Also supports optionally iterating or breaking of the file
|
29 | 32 | into chunks.
|
|
50 | 53 | One-character string used to escape delimiter when quoting is QUOTE_NONE.
|
51 | 54 | dtype : Type name or dict of column -> type
|
52 | 55 | Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
|
| 56 | + (Unsupported with engine='python') |
53 | 57 | compression : {'gzip', 'bz2', None}, default None
|
54 | 58 | For on-the-fly decompression of on-disk data
|
55 | 59 | dialect : string or csv.Dialect instance, default None
|
|
113 | 117 | chunksize : int, default None
|
114 | 118 | Return TextFileReader object for iteration
|
115 | 119 | skipfooter : int, default 0
|
116 |
| - Number of line at bottom of file to skip |
| 120 | + Number of lines at bottom of file to skip (Unsupported with engine='c') |
117 | 121 | converters : dict, optional
|
118 | 122 | Dict of functions for converting values in certain columns. Keys can either
|
119 | 123 | be integers or column labels
|
|
125 | 129 | Encoding to use for UTF when reading/writing (ex. 'utf-8')
|
126 | 130 | squeeze : boolean, default False
|
127 | 131 | If the parsed data only contains one column then return a Series
|
128 |
| -na_filter: boolean, default True |
| 132 | +na_filter : boolean, default True |
129 | 133 | Detect missing value markers (empty strings and the value of na_values). In
|
130 | 134 | data without any NAs, passing na_filter=False can improve the performance
|
131 | 135 | of reading a large file
|
132 | 136 | usecols : array-like
|
133 | 137 | Return a subset of the columns.
|
134 | 138 | Results in much faster parsing time and lower memory usage.
|
135 |
| -mangle_dupe_cols: boolean, default True |
| 139 | +mangle_dupe_cols : boolean, default True |
136 | 140 | Duplicate columns will be specified as 'X.0'...'X.N', rather than 'X'...'X'
|
137 |
| -tupleize_cols: boolean, default False |
| 141 | +tupleize_cols : boolean, default False |
138 | 142 | Leave a list of tuples on columns as is (default is to convert to
|
139 | 143 | a Multi Index on the columns)
|
140 |
| -error_bad_lines: boolean, default True |
| 144 | +error_bad_lines : boolean, default True |
141 | 145 | Lines with too many fields (e.g. a csv line with too many commas) will by
|
142 | 146 | default cause an exception to be raised, and no DataFrame will be returned.
|
143 | 147 | If False, then these "bad lines" will be dropped from the DataFrame that is
|
144 |
| - returned. (Only valid with C parser). |
145 |
| -warn_bad_lines: boolean, default True |
| 148 | + returned. (Only valid with C parser) |
| 149 | +warn_bad_lines : boolean, default True |
146 | 150 | If error_bad_lines is False, and warn_bad_lines is True, a warning for each
|
147 | 151 | "bad line" will be output. (Only valid with C parser)
|
148 | 152 | infer_datetime_format : boolean, default False
|
|
154 | 158 | result : DataFrame or TextParser
|
155 | 159 | """
|
156 | 160 |
|
157 |
| -_csv_sep = """sep : string, default ',' |
| 161 | +_csv_params = """sep : string, default ',' |
158 | 162 | Delimiter to use. If sep is None, will try to automatically determine
|
159 | 163 | this. Regular expressions are accepted.
|
160 |
| -""" |
| 164 | +engine : {'c', 'python'} |
| 165 | + Parser engine to use. The C engine is faster while the python engine is |
| 166 | + currently more feature-complete.""" |
161 | 167 |
|
162 |
| -_table_sep = """sep : string, default \\t (tab-stop) |
163 |
| - Delimiter to use. Regular expressions are accepted.""" |
| 168 | +_table_params = """sep : string, default \\t (tab-stop) |
| 169 | + Delimiter to use. Regular expressions are accepted. |
| 170 | +engine : {'c', 'python'} |
| 171 | + Parser engine to use. The C engine is faster while the python engine is |
| 172 | + currently more feature-complete.""" |
164 | 173 |
|
165 | 174 | _read_csv_doc = """
|
166 | 175 | Read CSV (comma-separated) file into DataFrame
|
167 | 176 |
|
168 | 177 | %s
|
169 |
| -""" % (_parser_params % _csv_sep) |
| 178 | +""" % (_parser_params % _csv_params) |
170 | 179 |
|
171 | 180 | _read_table_doc = """
|
172 | 181 | Read general delimited file into DataFrame
|
173 | 182 |
|
174 | 183 | %s
|
175 |
| -""" % (_parser_params % _table_sep) |
| 184 | +""" % (_parser_params % _table_params) |
176 | 185 |
|
177 | 186 | _fwf_widths = """\
|
178 | 187 | colspecs : list of pairs (int, int) or 'infer', optional
|
@@ -297,6 +306,8 @@ def _read(filepath_or_buffer, kwds):
|
297 | 306 |
|
298 | 307 | def _make_parser_function(name, sep=','):
|
299 | 308 |
|
| 309 | + default_sep = sep |
| 310 | + |
300 | 311 | def parser_f(filepath_or_buffer,
|
301 | 312 | sep=sep,
|
302 | 313 | dialect=None,
|
@@ -325,7 +336,7 @@ def parser_f(filepath_or_buffer,
|
325 | 336 | dtype=None,
|
326 | 337 | usecols=None,
|
327 | 338 |
|
328 |
| - engine='c', |
| 339 | + engine=None, |
329 | 340 | delim_whitespace=False,
|
330 | 341 | as_recarray=False,
|
331 | 342 | na_filter=True,
|
@@ -362,10 +373,21 @@ def parser_f(filepath_or_buffer,
|
362 | 373 | if delimiter is None:
|
363 | 374 | delimiter = sep
|
364 | 375 |
|
| 376 | + if delim_whitespace and delimiter is not default_sep: |
| 377 | + raise ValueError("Specified a delimiter with both sep and"\ |
| 378 | + " delim_whitespace=True; you can only specify one.") |
| 379 | + |
| 380 | + if engine is not None: |
| 381 | + engine_specified = True |
| 382 | + else: |
| 383 | + engine = 'c' |
| 384 | + engine_specified = False |
| 385 | + |
365 | 386 | kwds = dict(delimiter=delimiter,
|
366 | 387 | engine=engine,
|
367 | 388 | dialect=dialect,
|
368 | 389 | compression=compression,
|
| 390 | + engine_specified=engine_specified, |
369 | 391 |
|
370 | 392 | doublequote=doublequote,
|
371 | 393 | escapechar=escapechar,
|
@@ -468,10 +490,18 @@ class TextFileReader(object):
|
468 | 490 |
|
469 | 491 | """
|
470 | 492 |
|
471 |
| - def __init__(self, f, engine='python', **kwds): |
| 493 | + def __init__(self, f, engine=None, **kwds): |
472 | 494 |
|
473 | 495 | self.f = f
|
474 | 496 |
|
| 497 | + if engine is not None: |
| 498 | + engine_specified = True |
| 499 | + else: |
| 500 | + engine = 'python' |
| 501 | + engine_specified = False |
| 502 | + |
| 503 | + self._engine_specified = kwds.get('engine_specified', engine_specified) |
| 504 | + |
475 | 505 | if kwds.get('dialect') is not None:
|
476 | 506 | dialect = kwds['dialect']
|
477 | 507 | kwds['delimiter'] = dialect.delimiter
|
@@ -530,30 +560,60 @@ def _get_options_with_defaults(self, engine):
|
530 | 560 | def _clean_options(self, options, engine):
|
531 | 561 | result = options.copy()
|
532 | 562 |
|
| 563 | + engine_specified = self._engine_specified |
| 564 | + fallback_reason = None |
| 565 | + |
533 | 566 | sep = options['delimiter']
|
534 | 567 | delim_whitespace = options['delim_whitespace']
|
535 | 568 |
|
| 569 | + # Options the C engine does not support yet |
| 570 | + if engine == 'c': |
| 571 | + if options['skip_footer'] > 0: |
| 572 | + fallback_reason = "the 'c' engine does not support"\ |
| 573 | + " skip_footer" |
| 574 | + engine = 'python' |
| 575 | + |
536 | 576 | if sep is None and not delim_whitespace:
|
537 | 577 | if engine == 'c':
|
| 578 | + fallback_reason = "the 'c' engine does not support"\ |
| 579 | + " sep=None with delim_whitespace=False" |
538 | 580 | engine = 'python'
|
539 | 581 | elif sep is not None and len(sep) > 1:
|
540 |
| - # wait until regex engine integrated |
541 |
| - if engine not in ('python', 'python-fwf'): |
| 582 | + if engine == 'c' and sep == '\s+': |
| 583 | + result['delim_whitespace'] = True |
| 584 | + del result['delimiter'] |
| 585 | + elif engine not in ('python', 'python-fwf'): |
| 586 | + # wait until regex engine integrated |
| 587 | + fallback_reason = "the 'c' engine does not support"\ |
| 588 | + " regex separators" |
542 | 589 | engine = 'python'
|
543 | 590 |
|
544 |
| - # C engine not supported yet |
545 |
| - if engine == 'c': |
546 |
| - if options['skip_footer'] > 0: |
547 |
| - engine = 'python' |
| 591 | + if fallback_reason and engine_specified: |
| 592 | + raise ValueError(fallback_reason) |
548 | 593 |
|
549 | 594 | if engine == 'c':
|
550 | 595 | for arg in _c_unsupported:
|
551 | 596 | del result[arg]
|
552 | 597 |
|
553 | 598 | if 'python' in engine:
|
554 | 599 | for arg in _python_unsupported:
|
| 600 | + if fallback_reason and result[arg] != _c_parser_defaults[arg]: |
| 601 | + msg = ("Falling back to the 'python' engine because" |
| 602 | + " {reason}, but this causes {option!r} to be" |
| 603 | + " ignored as it is not supported by the 'python'" |
| 604 | + " engine.").format(reason=fallback_reason, option=arg) |
| 605 | + if arg == 'dtype': |
| 606 | + msg += " (Note the 'converters' option provides"\ |
| 607 | + " similar functionality.)" |
| 608 | + raise ValueError(msg) |
555 | 609 | del result[arg]
|
556 | 610 |
|
| 611 | + if fallback_reason: |
| 612 | + warnings.warn(("Falling back to the 'python' engine because" |
| 613 | + " {0}; you can avoid this warning by specifying" |
| 614 | + " engine='python'.").format(fallback_reason), |
| 615 | + ParserWarning) |
| 616 | + |
557 | 617 | index_col = options['index_col']
|
558 | 618 | names = options['names']
|
559 | 619 | converters = options['converters']
|
|
0 commit comments