BUG: read_fwf inference should respect skiprows (#11256)

dsm054 · dsm054 · commit 11aa4e35ae22 · 2016-08-17T23:42:01.000-04:00
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -992,6 +992,7 @@ Bug Fixes
 - Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)
 
 - Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)
+- Bug in ``pd.read_fwf`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)
 
 - Bug in ``Index`` raises ``KeyError`` displaying incorrect column when column is not in the df and columns contains duplicate values (:issue:`13822`)
 - Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -311,7 +311,7 @@
     fields of each line as half-open intervals (i.e.,  [from, to[ ).
     String value 'infer' can be used to instruct the parser to try
     detecting the column specifications from the first 100 rows of
-    the data (default='infer').
+    the data which are not being skipped via skiprows (default='infer').
 widths : list of ints. optional
     A list of field widths which can be used instead of 'colspecs' if
     the intervals are contiguous.
@@ -2852,13 +2852,15 @@ class FixedWidthReader(BaseIterator):
     A reader of fixed-width lines.
     """
 
-    def __init__(self, f, colspecs, delimiter, comment):
+    def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
         self.f = f
         self.buffer = None
         self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
         self.comment = comment
+        if skiprows is None:
+            skiprows = set()
         if colspecs == 'infer':
-            self.colspecs = self.detect_colspecs()
+            self.colspecs = self.detect_colspecs(skiprows=skiprows)
         else:
             self.colspecs = colspecs
 
@@ -2875,20 +2877,34 @@ def __init__(self, f, colspecs, delimiter, comment):
                 raise TypeError('Each column specification must be '
                                 '2 element tuple or list of integers')
 
-    def get_rows(self, n):
-        rows = []
-        for i, row in enumerate(self.f, 1):
-            rows.append(row)
-            if i >= n:
+    def get_rows(self, n, skiprows=None):
+        """
+        Read rows until n non-skipped rows are found (or input ends).
+        Every row read, skipped or not, is kept in self.buffer, so it
+        may hold more than n lines; only rows not in skiprows are
+        returned for detect_colspecs. This leaves the skiprows logic
+        in other locations free to work on the unmodified line stream.
+        """
+        if skiprows is None:
+            skiprows = set()
+        buffer_rows = []
+        detect_rows = []
+        for i, row in enumerate(self.f):
+            if i not in skiprows:
+                detect_rows.append(row)
+            buffer_rows.append(row)
+            if len(detect_rows) >= n:
                 break
-        self.buffer = iter(rows)
-        return rows
+        self.buffer = iter(buffer_rows)
+        return detect_rows
 
-    def detect_colspecs(self, n=100):
+    def detect_colspecs(self, n=100, skiprows=None):
         # Regex escape the delimiters
         delimiters = ''.join([r'\%s' % x for x in self.delimiter])
         pattern = re.compile('([^%s]+)' % delimiters)
-        rows = self.get_rows(n)
+        rows = self.get_rows(n, skiprows)
+        if not rows:
+            raise EmptyDataError("No rows from which to infer column width")
         max_len = max(map(len, rows))
         mask = np.zeros(max_len + 1, dtype=int)
         if self.comment is not None:
@@ -2899,7 +2915,8 @@ def detect_colspecs(self, n=100):
         shifted = np.roll(mask, 1)
         shifted[0] = 0
         edges = np.where((mask ^ shifted) == 1)[0]
-        return list(zip(edges[::2], edges[1::2]))
+        edge_pairs = list(zip(edges[::2], edges[1::2]))
+        return edge_pairs
 
     def __next__(self):
         if self.buffer is not None:
@@ -2924,9 +2941,8 @@ class FixedWidthFieldParser(PythonParser):
     def __init__(self, f, **kwds):
         # Support iterators, convert to a list.
         self.colspecs = kwds.pop('colspecs')
-
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f):
         self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
-                                     self.comment)
+                                     self.comment, self.skiprows)
diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py
@@ -16,7 +16,7 @@
 from pandas import DataFrame
 from pandas import compat
 from pandas.compat import StringIO, BytesIO
-from pandas.io.parsers import read_csv, read_fwf
+from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
 
 
 class TestFwfParsing(tm.TestCase):
@@ -345,3 +345,41 @@ def test_variable_width_unicode(self):
                             header=None, encoding='utf8')
         tm.assert_frame_equal(expected, read_fwf(
             BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
+
+    def test_skiprows_inference(self):
+        # GH11256
+        test = '''
+Text contained in the file header
+
+DataCol1   DataCol2
+     0.0        1.0
+   101.6      956.1
+'''.strip()
+        expected = read_csv(StringIO(test), skiprows=2,
+                            delim_whitespace=True)
+        tm.assert_frame_equal(expected, read_fwf(
+            StringIO(test), skiprows=2))
+
+    def test_skiprows_by_index_inference(self):
+        test = '''
+To be skipped
+Not  To  Be  Skipped
+Once more to be skipped
+123  34   8      123
+456  78   9      456
+'''.strip()
+
+        expected = read_csv(StringIO(test), skiprows=[0, 2],
+                            delim_whitespace=True)
+        tm.assert_frame_equal(expected, read_fwf(
+            StringIO(test), skiprows=[0, 2]))
+
+    def test_skiprows_inference_empty(self):
+        test = '''
+AA   BBB  C
+12   345  6
+78   901  2
+'''.strip()
+
+        with tm.assertRaises(EmptyDataError):
+            read_fwf(StringIO(test), skiprows=3)