From 9f5e5ff9a5643bc535476f2435c7a40570070bcb Mon Sep 17 00:00:00 2001
From: Viktor Kerkez <alefnula@gmail.com>
Date: Sat, 28 Sep 2013 17:48:01 +0200
Subject: [PATCH] ENH: Added automatic colspecs detection to read_fwf (GH4488)

Implemented an algorithm that uses a bitmask to detect the gaps between the columns.
The reader buffers the lines used for detection in case the input stream is not seekable.
---
 .gitignore                      |   1 +
 doc/source/io.rst               |  23 +++++--
 doc/source/release.rst          |   1 +
 doc/source/v0.13.0.txt          |   3 +
 pandas/io/parsers.py            |  98 +++++++++++++++++++---------
 pandas/io/tests/test_parsers.py | 111 ++++++++++++++++++++++++++++++--
 6 files changed, 199 insertions(+), 38 deletions(-)

diff --git a/.gitignore b/.gitignore
index df7002a79d974..da76a414865e5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -41,3 +41,4 @@ pandas/io/*.json
 
 .project
 .pydevproject
+.settings
diff --git a/doc/source/io.rst b/doc/source/io.rst
index 01795f6a4a9bf..5e04fcff61539 100644
--- a/doc/source/io.rst
+++ b/doc/source/io.rst
@@ -742,10 +742,13 @@ function works with data files that have known and fixed column widths.
 The function parameters to ``read_fwf`` are largely the same as `read_csv` with
 two extra parameters:
 
-  - ``colspecs``: a list of pairs (tuples), giving the extents of the
-    fixed-width fields of each line as half-open intervals [from, to[
-  - ``widths``: a list of field widths, which can be used instead of
-    ``colspecs`` if the intervals are contiguous
+  - ``colspecs``: A list of pairs (tuples) giving the extents of the
+    fixed-width fields of each line as half-open intervals (i.e.,  [from, to[ ).
+    String value 'infer' can be used to instruct the parser to try detecting
+    the column specifications from the first 100 rows of the data. Default
+    behaviour, if not specified, is to infer.
+  - ``widths``: A list of field widths which can be used instead of ``colspecs``
+    if the intervals are contiguous.
 
 .. ipython:: python
    :suppress:
@@ -789,6 +792,18 @@ column widths for contiguous columns:
 The parser will take care of extra white spaces around the columns
 so it's ok to have extra separation between the columns in the file.
 
+.. versionadded:: 0.13.0
+
+By default, ``read_fwf`` will try to infer the file's ``colspecs`` by using the
+first 100 rows of the file. Inference works only when the columns are aligned
+and correctly separated by the provided ``delimiter`` (the default delimiter
+is whitespace).
+
+.. ipython:: python
+
+   df = pd.read_fwf('bar.csv', header=None, index_col=0)
+   df
+
 .. ipython:: python
    :suppress:
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
index f3f86dec92502..177381346e2d1 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -59,6 +59,7 @@ New features
   - Added ``isin`` method to DataFrame (:issue:`4211`)
   - Clipboard functionality now works with PySide (:issue:`4282`)
   - New ``extract`` string method returns regex matches more conveniently (:issue:`4685`)
+  - Auto-detect field widths in ``read_fwf`` when unspecified (:issue:`4488`)
 
 Experimental Features
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/doc/source/v0.13.0.txt b/doc/source/v0.13.0.txt
index 0796f34ead839..0e3c3b50fcd85 100644
--- a/doc/source/v0.13.0.txt
+++ b/doc/source/v0.13.0.txt
@@ -421,6 +421,9 @@ Enhancements
 
     can also be used.
   - ``read_stata` now accepts Stata 13 format (:issue:`4291`)
+  - ``read_fwf`` now infers the column specifications from the first 100 rows of
+    the file if the data has correctly separated and properly aligned columns
+    using the delimiter provided to the function (:issue:`4488`).
 
 .. _whatsnew_0130.experimental:
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index e0b12277f4416..3ef3cbf856fef 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -160,11 +160,15 @@
 """ % (_parser_params % _table_sep)
 
 _fwf_widths = """\
-colspecs : a list of pairs (tuples), giving the extents
-    of the fixed-width fields of each line as half-open internals
-    (i.e.,  [from, to[  ).
-widths : a list of field widths, which can be used instead of
-    'colspecs' if the intervals are contiguous.
+colspecs : list of pairs (int, int) or 'infer', optional
+    A list of pairs (tuples) giving the extents of the fixed-width
+    fields of each line as half-open intervals (i.e.,  [from, to[ ).
+    String value 'infer' can be used to instruct the parser to try
+    detecting the column specifications from the first 100 rows of
+    the data (default='infer').
+widths : list of ints, optional
+    A list of field widths which can be used instead of 'colspecs' if
+    the intervals are contiguous.
 """
 
 _read_fwf_doc = """
@@ -184,7 +188,8 @@ def _read(filepath_or_buffer, kwds):
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter
 
-    filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer)
+    filepath_or_buffer, _ = get_filepath_or_buffer(filepath_or_buffer,
+                                                   encoding)
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
@@ -267,8 +272,8 @@ def _read(filepath_or_buffer, kwds):
 }
 
 _fwf_defaults = {
-    'colspecs': None,
-    'widths': None
+    'colspecs': 'infer',
+    'widths': None,
 }
 
 _c_unsupported = set(['skip_footer'])
@@ -412,15 +417,15 @@ def parser_f(filepath_or_buffer,
 
 
 @Appender(_read_fwf_doc)
-def read_fwf(filepath_or_buffer, colspecs=None, widths=None, **kwds):
+def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
     # Check input arguments.
     if colspecs is None and widths is None:
         raise ValueError("Must specify either colspecs or widths")
-    elif colspecs is not None and widths is not None:
+    elif colspecs not in (None, 'infer') and widths is not None:
         raise ValueError("You must specify only one of 'widths' and "
                          "'colspecs'")
 
-    # Compute 'colspec' from 'widths', if specified.
+    # Compute 'colspecs' from 'widths', if specified.
     if widths is not None:
         colspecs, col = [], 0
         for w in widths:
@@ -519,7 +524,8 @@ def _clean_options(self, options, engine):
                 engine = 'python'
         elif sep is not None and len(sep) > 1:
             # wait until regex engine integrated
-            engine = 'python'
+            if engine not in ('python', 'python-fwf'):
+                engine = 'python'
 
         # C engine not supported yet
         if engine == 'c':
@@ -2012,31 +2018,65 @@ class FixedWidthReader(object):
     """
     A reader of fixed-width lines.
     """
-    def __init__(self, f, colspecs, filler, thousands=None, encoding=None):
+    def __init__(self, f, colspecs, delimiter, comment):
         self.f = f
-        self.colspecs = colspecs
-        self.filler = filler  # Empty characters between fields.
-        self.thousands = thousands
-        if encoding is None:
-            encoding = get_option('display.encoding')
-        self.encoding = encoding
-
-        if not isinstance(colspecs, (tuple, list)):
+        self.buffer = None
+        self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
+        self.comment = comment
+        if colspecs == 'infer':
+            self.colspecs = self.detect_colspecs()
+        else:
+            self.colspecs = colspecs
+
+        if not isinstance(self.colspecs, (tuple, list)):
             raise TypeError("column specifications must be a list or tuple, "
                             "input was a %r" % type(colspecs).__name__)
 
-        for colspec in colspecs:
+        for colspec in self.colspecs:
             if not (isinstance(colspec, (tuple, list)) and
-                    len(colspec) == 2 and
-                    isinstance(colspec[0], int) and
-                    isinstance(colspec[1], int)):
+                       len(colspec) == 2 and
+                       isinstance(colspec[0], (int, np.integer)) and
+                       isinstance(colspec[1], (int, np.integer))):
                 raise TypeError('Each column specification must be '
                                 '2 element tuple or list of integers')
 
+    def get_rows(self, n):
+        rows = []
+        for i, row in enumerate(self.f, 1):
+            rows.append(row)
+            if i >= n:
+                break
+        self.buffer = iter(rows)
+        return rows
+
+    def detect_colspecs(self, n=100):
+        # Regex escape the delimiters
+        delimiters = ''.join([r'\%s' % x for x in self.delimiter])
+        pattern = re.compile('([^%s]+)' % delimiters)
+        rows = self.get_rows(n)
+        max_len = max(map(len, rows))
+        mask = np.zeros(max_len + 1, dtype=int)
+        if self.comment is not None:
+            rows = [row.partition(self.comment)[0] for row in rows]
+        for row in rows:
+            for m in pattern.finditer(row):
+                mask[m.start():m.end()] = 1
+        shifted = np.roll(mask, 1)
+        shifted[0] = 0
+        edges = np.where((mask ^ shifted) == 1)[0]
+        return list(zip(edges[::2], edges[1::2]))
+
     def next(self):
-        line = next(self.f)
+        if self.buffer is not None:
+            try:
+                line = next(self.buffer)
+            except StopIteration:
+                self.buffer = None
+                line = next(self.f)
+        else:
+            line = next(self.f)
         # Note: 'colspecs' is a sequence of half-open intervals.
-        return [line[fromm:to].strip(self.filler or ' ')
+        return [line[fromm:to].strip(self.delimiter)
                 for (fromm, to) in self.colspecs]
 
     # Iterator protocol in Python 3 uses __next__()
@@ -2050,10 +2090,10 @@ class FixedWidthFieldParser(PythonParser):
     """
     def __init__(self, f, **kwds):
         # Support iterators, convert to a list.
-        self.colspecs = list(kwds.pop('colspecs'))
+        self.colspecs = kwds.pop('colspecs')
 
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f):
         self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
-                                     encoding=self.encoding)
+                                     self.comment)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 4e0c00c8a31eb..44e40dc34ff25 100644
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1706,7 +1706,7 @@ def test_utf16_example(self):
             self.assertEquals(len(result), 50)
 
     def test_converters_corner_with_nas(self):
-      # skip aberration observed on Win64 Python 3.2.2
+        # skip aberration observed on Win64 Python 3.2.2
         if hash(np.int64(-1)) != -2:
             raise nose.SkipTest("skipping because of windows hash on Python"
                                 " 3.2.2")
@@ -2078,19 +2078,19 @@ def test_fwf(self):
             read_fwf(StringIO(data3), colspecs=colspecs, widths=[6, 10, 10, 7])
 
         with tm.assertRaisesRegexp(ValueError, "Must specify either"):
-            read_fwf(StringIO(data3))
+            read_fwf(StringIO(data3), colspecs=None, widths=None)
 
     def test_fwf_colspecs_is_list_or_tuple(self):
         with tm.assertRaisesRegexp(TypeError,
                                    'column specifications must be a list or '
                                    'tuple.+'):
-            fwr = pd.io.parsers.FixedWidthReader(StringIO(self.data1),
-                                                 {'a': 1}, ',')
+            pd.io.parsers.FixedWidthReader(StringIO(self.data1),
+                                           {'a': 1}, ',', '#')
 
     def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples(self):
         with tm.assertRaisesRegexp(TypeError,
                                    'Each column specification must be.+'):
-            read_fwf(StringIO(self.data1), {'a': 1})
+            read_fwf(StringIO(self.data1), [('a', 1)])
 
     def test_fwf_regression(self):
         # GH 3594
@@ -2223,6 +2223,107 @@ def test_iteration_open_handle(self):
                 expected = Series(['DDD', 'EEE', 'FFF', 'GGG'])
                 tm.assert_series_equal(result, expected)
 
+
+class TestFwfColspaceSniffing(unittest.TestCase):
+    def test_full_file(self):
+        # File with all values
+        test = '''index                             A    B    C
+2000-01-03T00:00:00  0.980268513777    3  foo
+2000-01-04T00:00:00  1.04791624281    -4  bar
+2000-01-05T00:00:00  0.498580885705   73  baz
+2000-01-06T00:00:00  1.12020151869     1  foo
+2000-01-07T00:00:00  0.487094399463    0  bar
+2000-01-10T00:00:00  0.836648671666    2  baz
+2000-01-11T00:00:00  0.157160753327   34  foo'''
+        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_missing(self):
+        # File with missing values
+        test = '''index                             A    B    C
+2000-01-03T00:00:00  0.980268513777    3  foo
+2000-01-04T00:00:00  1.04791624281    -4  bar
+                     0.498580885705   73  baz
+2000-01-06T00:00:00  1.12020151869     1  foo
+2000-01-07T00:00:00                    0  bar
+2000-01-10T00:00:00  0.836648671666    2  baz
+                                      34'''
+        colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_spaces(self):
+        # File with spaces in columns
+        test = '''
+Account                 Name  Balance     CreditLimit   AccountCreated
+101     Keanu Reeves          9315.45     10000.00           1/17/1998
+312     Gerard Butler         90.00       1000.00             8/6/2003
+868     Jennifer Love Hewitt  0           17000.00           5/25/1985
+761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
+317     Bill Murray           789.65      5000.00             2/5/2007
+'''.strip('\r\n')
+        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_full_file_with_spaces_and_missing(self):
+        # File with spaces and missing values in columns
+        test = '''
+Account               Name    Balance     CreditLimit   AccountCreated
+101                           10000.00                       1/17/1998
+312     Gerard Butler         90.00       1000.00             8/6/2003
+868                                                          5/25/1985
+761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
+317     Bill Murray           789.65
+'''.strip('\r\n')
+        colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_messed_up_data(self):
+        # Completely messed up file
+        test = '''
+   Account          Name             Balance     Credit Limit   Account Created
+       101                           10000.00                       1/17/1998
+       312     Gerard Butler         90.00       1000.00
+
+       761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
+  317          Bill Murray           789.65
+'''.strip('\r\n')
+        colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
+        expected = read_fwf(StringIO(test), colspecs=colspecs)
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
+
+    def test_multiple_delimiters(self):
+        test = r'''
+col1~~~~~col2  col3++++++++++++++++++col4
+~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
+  33+++122.33\\\bar.........Gerard Butler
+++44~~~~12.01   baz~~Jennifer Love Hewitt
+~~55       11+++foo++++Jada Pinkett-Smith
+..66++++++.03~~~bar           Bill Murray
+'''.strip('\r\n')
+        colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
+        expected = read_fwf(StringIO(test), colspecs=colspecs,
+                            delimiter=' +~.\\')
+        tm.assert_frame_equal(expected, read_fwf(StringIO(test),
+                                                 delimiter=' +~.\\'))
+
+    def test_variable_width_unicode(self):
+        if not compat.PY3:
+            raise nose.SkipTest('Bytes-related test - only needs to work on Python 3')
+        test = '''
+שלום שלום
+ום   שלל
+של   ום
+'''.strip('\r\n')
+        expected = pd.read_fwf(BytesIO(test.encode('utf8')),
+                               colspecs=[(0, 4), (5, 9)], header=None)
+        tm.assert_frame_equal(expected, read_fwf(BytesIO(test.encode('utf8')),
+                                                 header=None))
+
+
 class TestCParserHighMemory(ParserTests, unittest.TestCase):
 
     def read_csv(self, *args, **kwds):