From 9f5e5ff9a5643bc535476f2435c7a40570070bcb Mon Sep 17 00:00:00 2001 From: Viktor Kerkez Date: Sat, 28 Sep 2013 17:48:01 +0200 Subject: [PATCH] ENH: Added automatic colspecs detection to read_fwf (GH4488) Implemented an algorithm that uses a bitmask to detect the gaps between the columns. The reader buffers the lines used for detection in case the input stream is not seekable. --- .gitignore | 1 + doc/source/io.rst | 23 +++++-- doc/source/release.rst | 1 + doc/source/v0.13.0.txt | 3 + pandas/io/parsers.py | 98 +++++++++++++++++++--------- pandas/io/tests/test_parsers.py | 111 ++++++++++++++++++++++++++++++-- 6 files changed, 199 insertions(+), 38 deletions(-) diff --git a/.gitignore b/.gitignore index df7002a79d974..da76a414865e5 100644 --- a/.gitignore +++ b/.gitignore @@ -41,3 +41,4 @@ pandas/io/*.json .project .pydevproject +.settings diff --git a/doc/source/io.rst b/doc/source/io.rst index 01795f6a4a9bf..5e04fcff61539 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -742,10 +742,13 @@ function works with data files that have known and fixed column widths. The function parameters to ``read_fwf`` are largely the same as `read_csv` with two extra parameters: - - ``colspecs``: a list of pairs (tuples), giving the extents of the - fixed-width fields of each line as half-open intervals [from, to[ - - ``widths``: a list of field widths, which can be used instead of - ``colspecs`` if the intervals are contiguous + - ``colspecs``: A list of pairs (tuples) giving the extents of the + fixed-width fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try detecting + the column specifications from the first 100 rows of the data. Default + behaviour, if not specified, is to infer. + - ``widths``: A list of field widths which can be used instead of 'colspecs' + if the intervals are contiguous. .. 
ipython:: python :suppress: @@ -789,6 +792,18 @@ column widths for contiguous columns: The parser will take care of extra white spaces around the columns so it's ok to have extra separation between the columns in the file. +.. versionadded:: 0.13.0 + +By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the +first 100 rows of the file. It can do it only in cases when the columns are +aligned and correctly separated by the provided ``delimiter`` (default delimiter +is whitespace). + +.. ipython:: python + + df = pd.read_fwf('bar.csv', header=None, index_col=0) + df + .. ipython:: python :suppress: diff --git a/doc/source/release.rst b/doc/source/release.rst index f3f86dec92502..177381346e2d1 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -59,6 +59,7 @@ New features - Added ``isin`` method to DataFrame (:issue:`4211`) - Clipboard functionality now works with PySide (:issue:`4282`) - New ``extract`` string method returns regex matches more conveniently (:issue:`4685`) + - Auto-detect field widths in read_fwf when unspecified (:issue:`4488`) Experimental Features ~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt index 0796f34ead839..0e3c3b50fcd85 100644 --- a/doc/source/v0.13.0.txt +++ b/doc/source/v0.13.0.txt @@ -421,6 +421,9 @@ Enhancements can also be used. - ``read_stata` now accepts Stata 13 format (:issue:`4291`) + - ``read_fwf`` now infers the column specifications from the first 100 rows of + the file if the data has correctly separated and properly aligned columns + using the delimiter provided to the function (:issue:`4488`). .. 
_whatsnew_0130.experimental: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index e0b12277f4416..3ef3cbf856fef 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -160,11 +160,15 @@ """ % (_parser_params % _table_sep) _fwf_widths = """\ -colspecs : a list of pairs (tuples), giving the extents - of the fixed-width fields of each line as half-open internals - (i.e., [from, to[ ). -widths : a list of field widths, which can be used instead of - 'colspecs' if the intervals are contiguous. +colspecs : list of pairs (int, int) or 'infer'. optional + A list of pairs (tuples) giving the extents of the fixed-width + fields of each line as half-open intervals (i.e., [from, to[ ). + String value 'infer' can be used to instruct the parser to try + detecting the column specifications from the first 100 rows of + the data (default='infer'). +widths : list of ints. optional + A list of field widths which can be used instead of 'colspecs' if + the intervals are contiguous. """ _read_fwf_doc = """ @@ -184,7 +188,8 @@ def _read(filepath_or_buffer, kwds): if skipfooter is not None: kwds['skip_footer'] = skipfooter - filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer, + encoding) if kwds.get('date_parser', None) is not None: if isinstance(kwds['parse_dates'], bool): @@ -267,8 +272,8 @@ def _read(filepath_or_buffer, kwds): } _fwf_defaults = { - 'colspecs': None, - 'widths': None + 'colspecs': 'infer', + 'widths': None, } _c_unsupported = set(['skip_footer']) @@ -412,15 +417,15 @@ def parser_f(filepath_or_buffer, @Appender(_read_fwf_doc) -def read_fwf(filepath_or_buffer, colspecs=None, widths=None, **kwds): +def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds): # Check input arguments. 
if colspecs is None and widths is None: raise ValueError("Must specify either colspecs or widths") - elif colspecs is not None and widths is not None: + elif colspecs not in (None, 'infer') and widths is not None: raise ValueError("You must specify only one of 'widths' and " "'colspecs'") - # Compute 'colspec' from 'widths', if specified. + # Compute 'colspecs' from 'widths', if specified. if widths is not None: colspecs, col = [], 0 for w in widths: @@ -519,7 +524,8 @@ def _clean_options(self, options, engine): engine = 'python' elif sep is not None and len(sep) > 1: # wait until regex engine integrated - engine = 'python' + if engine not in ('python', 'python-fwf'): + engine = 'python' # C engine not supported yet if engine == 'c': @@ -2012,31 +2018,65 @@ class FixedWidthReader(object): """ A reader of fixed-width lines. """ - def __init__(self, f, colspecs, filler, thousands=None, encoding=None): + def __init__(self, f, colspecs, delimiter, comment): self.f = f - self.colspecs = colspecs - self.filler = filler # Empty characters between fields. 
- self.thousands = thousands - if encoding is None: - encoding = get_option('display.encoding') - self.encoding = encoding - - if not isinstance(colspecs, (tuple, list)): + self.buffer = None + self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' + self.comment = comment + if colspecs == 'infer': + self.colspecs = self.detect_colspecs() + else: + self.colspecs = colspecs + + if not isinstance(self.colspecs, (tuple, list)): raise TypeError("column specifications must be a list or tuple, " "input was a %r" % type(colspecs).__name__) - for colspec in colspecs: + for colspec in self.colspecs: if not (isinstance(colspec, (tuple, list)) and - len(colspec) == 2 and - isinstance(colspec[0], int) and - isinstance(colspec[1], int)): + len(colspec) == 2 and + isinstance(colspec[0], (int, np.integer)) and + isinstance(colspec[1], (int, np.integer))): raise TypeError('Each column specification must be ' '2 element tuple or list of integers') + def get_rows(self, n): + rows = [] + for i, row in enumerate(self.f, 1): + rows.append(row) + if i >= n: + break + self.buffer = iter(rows) + return rows + + def detect_colspecs(self, n=100): + # Regex escape the delimiters + delimiters = ''.join([r'\%s' % x for x in self.delimiter]) + pattern = re.compile('([^%s]+)' % delimiters) + rows = self.get_rows(n) + max_len = max(map(len, rows)) + mask = np.zeros(max_len + 1, dtype=int) + if self.comment is not None: + rows = [row.partition(self.comment)[0] for row in rows] + for row in rows: + for m in pattern.finditer(row): + mask[m.start():m.end()] = 1 + shifted = np.roll(mask, 1) + shifted[0] = 0 + edges = np.where((mask ^ shifted) == 1)[0] + return list(zip(edges[::2], edges[1::2])) + def next(self): - line = next(self.f) + if self.buffer is not None: + try: + line = next(self.buffer) + except StopIteration: + self.buffer = None + line = next(self.f) + else: + line = next(self.f) # Note: 'colspecs' is a sequence of half-open intervals. 
- return [line[fromm:to].strip(self.filler or ' ') + return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs] # Iterator protocol in Python 3 uses __next__() @@ -2050,10 +2090,10 @@ class FixedWidthFieldParser(PythonParser): """ def __init__(self, f, **kwds): # Support iterators, convert to a list. - self.colspecs = list(kwds.pop('colspecs')) + self.colspecs = kwds.pop('colspecs') PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - encoding=self.encoding) + self.comment) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 4e0c00c8a31eb..44e40dc34ff25 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1706,7 +1706,7 @@ def test_utf16_example(self): self.assertEquals(len(result), 50) def test_converters_corner_with_nas(self): - # skip aberration observed on Win64 Python 3.2.2 + # skip aberration observed on Win64 Python 3.2.2 if hash(np.int64(-1)) != -2: raise nose.SkipTest("skipping because of windows hash on Python" " 3.2.2") @@ -2078,19 +2078,19 @@ def test_fwf(self): read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7]) with tm.assertRaisesRegexp(ValueError, "Must specify either"): - read_fwf(StringIO(data3)) + read_fwf(StringIO(data3), colspecs=None, widths=None) def test_fwf_colspecs_is_list_or_tuple(self): with tm.assertRaisesRegexp(TypeError, 'column specifications must be a list or ' 'tuple.+'): - fwr = pd.io.parsers.FixedWidthReader(StringIO(self.data1), - {'a': 1}, ',') + pd.io.parsers.FixedWidthReader(StringIO(self.data1), + {'a': 1}, ',', '#') def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self): with tm.assertRaisesRegexp(TypeError, 'Each column specification must be.+'): - read_fwf(StringIO(self.data1), {'a': 1}) + read_fwf(StringIO(self.data1), [('a', 1)]) def test_fwf_regression(self): # GH 3594 @@ -2223,6 +2223,107 @@ def test_iteration_open_handle(self): 
expected = Series(['DDD', 'EEE', 'FFF', 'GGG']) tm.assert_series_equal(result, expected) + +class TestFwfColspaceSniffing(unittest.TestCase): + def test_full_file(self): + # File with all values + test = '''index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar +2000-01-05T00:00:00 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0.487094399463 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz +2000-01-11T00:00:00 0.157160753327 34 foo''' + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_full_file_with_missing(self): + # File with missing values + test = '''index A B C +2000-01-03T00:00:00 0.980268513777 3 foo +2000-01-04T00:00:00 1.04791624281 -4 bar + 0.498580885705 73 baz +2000-01-06T00:00:00 1.12020151869 1 foo +2000-01-07T00:00:00 0 bar +2000-01-10T00:00:00 0.836648671666 2 baz + 34''' + colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_full_file_with_spaces(self): + # File with spaces in columns + test = ''' +Account Name Balance CreditLimit AccountCreated +101 Keanu Reeves 9315.45 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 Jennifer Love Hewitt 0 17000.00 5/25/1985 +761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 5000.00 2/5/2007 +'''.strip('\r\n') + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_full_file_with_spaces_and_missing(self): + # File with spaces and missing values in columns + test = ''' +Account Name Balance CreditLimit AccountCreated +101 10000.00 1/17/1998 +312 Gerard Butler 90.00 1000.00 8/6/2003 +868 5/25/1985 +761 Jada 
Pinkett-Smith 49654.87 100000.00 12/5/2006 +317 Bill Murray 789.65 +'''.strip('\r\n') + colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_messed_up_data(self): + # Completely messed up file + test = ''' + Account Name Balance Credit Limit Account Created + 101 10000.00 1/17/1998 + 312 Gerard Butler 90.00 1000.00 + + 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 + 317 Bill Murray 789.65 +'''.strip('\r\n') + colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) + expected = read_fwf(StringIO(test), colspecs=colspecs) + tm.assert_frame_equal(expected, read_fwf(StringIO(test))) + + def test_multiple_delimiters(self): + test = r''' +col1~~~~~col2 col3++++++++++++++++++col4 +~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves + 33+++122.33\\\bar.........Gerard Butler +++44~~~~12.01 baz~~Jennifer Love Hewitt +~~55 11+++foo++++Jada Pinkett-Smith +..66++++++.03~~~bar Bill Murray +'''.strip('\r\n') + colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) + expected = read_fwf(StringIO(test), colspecs=colspecs, + delimiter=' +~.\\') + tm.assert_frame_equal(expected, read_fwf(StringIO(test), + delimiter=' +~.\\')) + + def test_variable_width_unicode(self): + if not compat.PY3: + raise nose.SkipTest('Bytes-related test - only needs to work on Python 3') + test = ''' +שלום שלום +ום שלל +של ום +'''.strip('\r\n') + expected = pd.read_fwf(BytesIO(test.encode('utf8')), + colspecs=[(0, 4), (5, 9)], header=None) + tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')), + header=None)) + + class TestCParserHighMemory(ParserTests, unittest.TestCase): def read_csv(self, *args, **kwds):