BUG: read_fwf inference should respect skiprows (pandas-dev#11256)

dsm054 · jreback · commit aa03e7fa07d3 · 2017-01-10T08:16:15.000-05:00
Fix colspec inference in read_fwf, which did not skip the rows selected by skiprows: skiprows is now passed down the chain to the point where the rows used for inference are read. - [X] closes pandas-dev#11256 - [X] 3 tests added / passed - [X] passes `git diff upstream/master | flake8 --diff` - [X] whatsnew entry Author: D.S. McNeil <dsm054@gmail.com> Closes pandas-dev#14028 from dsm054/bugfix/fwf_skiprows and squashes the following commits: b5b3e66 [D.S. McNeil] BUG: read_fwf inference should respect skiprows (pandas-dev#11256)
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -305,6 +305,7 @@ Bug Fixes
 - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
 - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`)
 - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
+- Bug in ``pd.read_fwf()`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)
 
 - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -323,7 +323,7 @@
     fields of each line as half-open intervals (i.e.,  [from, to[ ).
     String value 'infer' can be used to instruct the parser to try
     detecting the column specifications from the first 100 rows of
-    the data (default='infer').
+    the data which are not being skipped via skiprows (default='infer').
 widths : list of ints. optional
     A list of field widths which can be used instead of 'colspecs' if
     the intervals are contiguous.
@@ -3034,13 +3034,13 @@ class FixedWidthReader(BaseIterator):
     A reader of fixed-width lines.
     """
 
-    def __init__(self, f, colspecs, delimiter, comment):
+    def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
         self.f = f
         self.buffer = None
         self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
         self.comment = comment
         if colspecs == 'infer':
-            self.colspecs = self.detect_colspecs()
+            self.colspecs = self.detect_colspecs(skiprows=skiprows)
         else:
             self.colspecs = colspecs
 
@@ -3049,28 +3049,57 @@ def __init__(self, f, colspecs, delimiter, comment):
                             "input was a %r" % type(colspecs).__name__)
 
         for colspec in self.colspecs:
-
             if not (isinstance(colspec, (tuple, list)) and
                     len(colspec) == 2 and
                     isinstance(colspec[0], (int, np.integer, type(None))) and
                     isinstance(colspec[1], (int, np.integer, type(None)))):
                 raise TypeError('Each column specification must be '
                                 '2 element tuple or list of integers')
 
-    def get_rows(self, n):
-        rows = []
-        for i, row in enumerate(self.f, 1):
-            rows.append(row)
-            if i >= n:
+    def get_rows(self, n, skiprows=None):
+        """
+        Read rows from self.f, skipping as specified.
+
+        We distinguish buffer_rows (the first <= n lines)
+        from the rows returned to detect_colspecs because
+        it's simpler to leave the other locations with
+        skiprows logic alone than to modify them to deal
+        with the fact we skipped some rows here as well.
+
+        Parameters
+        ----------
+        n : int
+            Number of rows to read from self.f, not counting
+            rows that are skipped.
+        skiprows : set, optional
+            Indices of rows to skip.
+
+        Returns
+        -------
+        detect_rows : list of str
+            The rows that were read and not skipped, i.e. the rows
+            to use for inferring the column specifications.
+
+        """
+        if skiprows is None:
+            skiprows = set()
+        buffer_rows = []
+        detect_rows = []
+        for i, row in enumerate(self.f):
+            if i not in skiprows:
+                detect_rows.append(row)
+            buffer_rows.append(row)
+            if len(detect_rows) >= n:
                 break
-        self.buffer = iter(rows)
-        return rows
+        self.buffer = iter(buffer_rows)
+        return detect_rows
 
-    def detect_colspecs(self, n=100):
+    def detect_colspecs(self, n=100, skiprows=None):
         # Regex escape the delimiters
         delimiters = ''.join([r'\%s' % x for x in self.delimiter])
         pattern = re.compile('([^%s]+)' % delimiters)
-        rows = self.get_rows(n)
+        rows = self.get_rows(n, skiprows)
+        if not rows:
+            raise EmptyDataError("No rows from which to infer column width")
         max_len = max(map(len, rows))
         mask = np.zeros(max_len + 1, dtype=int)
         if self.comment is not None:
@@ -3081,7 +3110,8 @@ def detect_colspecs(self, n=100):
         shifted = np.roll(mask, 1)
         shifted[0] = 0
         edges = np.where((mask ^ shifted) == 1)[0]
-        return list(zip(edges[::2], edges[1::2]))
+        edge_pairs = list(zip(edges[::2], edges[1::2]))
+        return edge_pairs
 
     def __next__(self):
         if self.buffer is not None:
@@ -3106,9 +3136,8 @@ class FixedWidthFieldParser(PythonParser):
     def __init__(self, f, **kwds):
         # Support iterators, convert to a list.
         self.colspecs = kwds.pop('colspecs')
-
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f):
         self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
-                                     self.comment)
+                                     self.comment, self.skiprows)
diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py
@@ -16,7 +16,7 @@
 from pandas import DataFrame
 from pandas import compat
 from pandas.compat import StringIO, BytesIO
-from pandas.io.parsers import read_csv, read_fwf
+from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
 
 
 class TestFwfParsing(tm.TestCase):
@@ -248,83 +248,83 @@ def test_bool_header_arg(self):
 
     def test_full_file(self):
         # File with all values
-        test = '''index                             A    B    C
+        test = """index                             A    B    C
 2000-01-03T00:00:00  0.980268513777    3  foo
 2000-01-04T00:00:00  1.04791624281    -4  bar
 2000-01-05T00:00:00  0.498580885705   73  baz
 2000-01-06T00:00:00  1.12020151869     1  foo
 2000-01-07T00:00:00  0.487094399463    0  bar
 2000-01-10T00:00:00  0.836648671666    2  baz
-2000-01-11T00:00:00  0.157160753327   34  foo'''
+2000-01-11T00:00:00  0.157160753327   34  foo"""
         colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
         expected = read_fwf(StringIO(test), colspecs=colspecs)
         tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
 
     def test_full_file_with_missing(self):
         # File with missing values
-        test = '''index                             A    B    C
+        test = """index                             A    B    C
 2000-01-03T00:00:00  0.980268513777    3  foo
 2000-01-04T00:00:00  1.04791624281    -4  bar
                      0.498580885705   73  baz
 2000-01-06T00:00:00  1.12020151869     1  foo
 2000-01-07T00:00:00                    0  bar
 2000-01-10T00:00:00  0.836648671666    2  baz
-                                      34'''
+                                      34"""
         colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
         expected = read_fwf(StringIO(test), colspecs=colspecs)
         tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
 
     def test_full_file_with_spaces(self):
         # File with spaces in columns
-        test = '''
+        test = """
 Account                 Name  Balance     CreditLimit   AccountCreated
 101     Keanu Reeves          9315.45     10000.00           1/17/1998
 312     Gerard Butler         90.00       1000.00             8/6/2003
 868     Jennifer Love Hewitt  0           17000.00           5/25/1985
 761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
 317     Bill Murray           789.65      5000.00             2/5/2007
-'''.strip('\r\n')
+""".strip('\r\n')
         colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
         expected = read_fwf(StringIO(test), colspecs=colspecs)
         tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
 
     def test_full_file_with_spaces_and_missing(self):
         # File with spaces and missing values in columsn
-        test = '''
+        test = """
 Account               Name    Balance     CreditLimit   AccountCreated
 101                           10000.00                       1/17/1998
 312     Gerard Butler         90.00       1000.00             8/6/2003
 868                                                          5/25/1985
 761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
 317     Bill Murray           789.65
-'''.strip('\r\n')
+""".strip('\r\n')
         colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
         expected = read_fwf(StringIO(test), colspecs=colspecs)
         tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
 
     def test_messed_up_data(self):
         # Completely messed up file
-        test = '''
+        test = """
    Account          Name             Balance     Credit Limit   Account Created
        101                           10000.00                       1/17/1998
        312     Gerard Butler         90.00       1000.00
 
        761     Jada Pinkett-Smith    49654.87    100000.00          12/5/2006
   317          Bill Murray           789.65
-'''.strip('\r\n')
+""".strip('\r\n')
         colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
         expected = read_fwf(StringIO(test), colspecs=colspecs)
         tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
 
     def test_multiple_delimiters(self):
-        test = r'''
+        test = r"""
 col1~~~~~col2  col3++++++++++++++++++col4
 ~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
   33+++122.33\\\bar.........Gerard Butler
 ++44~~~~12.01   baz~~Jennifer Love Hewitt
 ~~55       11+++foo++++Jada Pinkett-Smith
 ..66++++++.03~~~bar           Bill Murray
-'''.strip('\r\n')
+""".strip('\r\n')
         colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
         expected = read_fwf(StringIO(test), colspecs=colspecs,
                             delimiter=' +~.\\')
@@ -335,22 +335,22 @@ def test_variable_width_unicode(self):
         if not compat.PY3:
             raise nose.SkipTest(
                 'Bytes-related test - only needs to work on Python 3')
-        test = '''
+        test = """
 שלום שלום
 ום   שלל
 של   ום
-'''.strip('\r\n')
+""".strip('\r\n')
         expected = read_fwf(BytesIO(test.encode('utf8')),
                             colspecs=[(0, 4), (5, 9)],
                             header=None, encoding='utf8')
         tm.assert_frame_equal(expected, read_fwf(
             BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
 
     def test_dtype(self):
-        data = ''' a    b    c
+        data = """ a    b    c
 1    2    3.2
 3    4    5.2
-'''
+"""
         colspecs = [(0, 5), (5, 10), (10, None)]
         result = pd.read_fwf(StringIO(data), colspecs=colspecs)
         expected = pd.DataFrame({
@@ -365,3 +365,41 @@ def test_dtype(self):
         result = pd.read_fwf(StringIO(data), colspecs=colspecs,
                              dtype={'a': 'float64', 'b': str, 'c': 'int32'})
         tm.assert_frame_equal(result, expected)
+
+    def test_skiprows_inference(self):
+        # GH11256
+        test = """
+Text contained in the file header
+
+DataCol1   DataCol2
+     0.0        1.0
+   101.6      956.1
+""".strip()
+        expected = read_csv(StringIO(test), skiprows=2,
+                            delim_whitespace=True)
+        tm.assert_frame_equal(expected, read_fwf(
+            StringIO(test), skiprows=2))
+
+    def test_skiprows_by_index_inference(self):
+        test = """
+To be skipped
+Not  To  Be  Skipped
+Once more to be skipped
+123  34   8      123
+456  78   9      456
+""".strip()
+
+        expected = read_csv(StringIO(test), skiprows=[0, 2],
+                            delim_whitespace=True)
+        tm.assert_frame_equal(expected, read_fwf(
+            StringIO(test), skiprows=[0, 2]))
+
+    def test_skiprows_inference_empty(self):
+        test = """
+AA   BBB  C
+12   345  6
+78   901  2
+""".strip()
+
+        with tm.assertRaises(EmptyDataError):
+            read_fwf(StringIO(test), skiprows=3)