pandas-dev · WillAyd · Nov 26, 2018 · Oct 18, 2018 · Oct 18, 2018 · Oct 19, 2018
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -342,14 +342,17 @@
                          _engine_doc))
 
 _fwf_widths = """\
-colspecs : list of pairs (int, int) or 'infer'. optional
+colspecs : list of pairs (int, int) or 'infer', default 'infer'
     A list of pairs (tuples) giving the extents of the fixed-width
-    fields of each line as half-open intervals (i.e.,  [from, to[ ).
+    fields of each line as half-open intervals (i.e.,  [from, to) ).
     String value 'infer' can be used to instruct the parser to try
-    detecting the column specifications from the first 100 rows of
-    the data which are not being skipped via skiprows (default='infer').
-widths : list of ints. optional
-    A list of field widths which can be used instead of 'colspecs' if
+    detecting the column specifications using the ``infer_nrows``
+    number of rows of the data which are not being skipped via skiprows.
+infer_nrows : int, default 100
+    The number of rows to consider when letting the parser determine the
+    ``colspecs``.
+widths : list of ints, optional
+    A list of field widths which can be used instead of ``colspecs`` if
     the intervals are contiguous.
 delimiter : str, default ``'\t' + ' '``
     Characters to consider as filler characters in the fixed-width file.
@@ -527,6 +530,7 @@ def _read(filepath_or_buffer, kwds):
 
 _fwf_defaults = {
     'colspecs': 'infer',
+    'infer_nrows': 100,
     'widths': None,
 }
 
@@ -716,7 +720,8 @@ def parser_f(filepath_or_buffer,
 
 
 @Appender(_read_fwf_doc)
-def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
+def read_fwf(filepath_or_buffer, colspecs='infer', infer_nrows=100,
+             widths=None, **kwds):
     # Check input arguments.
     if colspecs is None and widths is None:
         raise ValueError("Must specify either colspecs or widths")
@@ -732,6 +737,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds):
             col += w
 
     kwds['colspecs'] = colspecs
+    kwds['infer_nrows'] = infer_nrows
     kwds['engine'] = 'python-fwf'
     return _read(filepath_or_buffer, kwds)
 
@@ -3361,13 +3367,15 @@ class FixedWidthReader(BaseIterator):
     A reader of fixed-width lines.
     """
 
-    def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
+    def __init__(self, f, colspecs, delimiter, comment, infer_nrows,
+                 skiprows=None):
         self.f = f
         self.buffer = None
         self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
         self.comment = comment
         if colspecs == 'infer':
-            self.colspecs = self.detect_colspecs(skiprows=skiprows)
+            self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows,
+                                                 skiprows=skiprows)
         else:
             self.colspecs = colspecs
 
@@ -3420,11 +3428,11 @@ def get_rows(self, n, skiprows=None):
         self.buffer = iter(buffer_rows)
         return detect_rows
 
-    def detect_colspecs(self, n=100, skiprows=None):
+    def detect_colspecs(self, infer_nrows, skiprows=None):
         # Regex escape the delimiters
         delimiters = ''.join(r'\%s' % x for x in self.delimiter)
         pattern = re.compile('([^%s]+)' % delimiters)
-        rows = self.get_rows(n, skiprows)
+        rows = self.get_rows(infer_nrows, skiprows)
         if not rows:
             raise EmptyDataError("No rows from which to infer column width")
         max_len = max(map(len, rows))
@@ -3463,8 +3471,10 @@ class FixedWidthFieldParser(PythonParser):
     def __init__(self, f, **kwds):
         # Support iterators, convert to a list.
         self.colspecs = kwds.pop('colspecs')
+        self.infer_nrows = kwds.pop('infer_nrows')
         PythonParser.__init__(self, f, **kwds)
 
     def _make_reader(self, f):
         self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
-                                     self.comment, self.skiprows)
+                                     self.comment, self.infer_nrows,
+                                     self.skiprows)
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
@@ -141,6 +141,17 @@ def test_fwf_colspecs_None(self):
         expected = DataFrame([[123456, 456], [456789, 789]])
         tm.assert_frame_equal(result, expected)
 
+    def test_fwf_colspecs_infer_nrows(self):
+        # GH 15138
+        # infer_nrows = 1 should have colspec == [(2, 3), (5, 6)]
+        data = """\
+  1  2
+123 98
+"""
+        df = read_fwf(StringIO(data), header=None, infer_nrows=1)
+        expected = pd.DataFrame([[1, 2], [3, 8]])
+        tm.assert_frame_equal(df, expected)
+
     def test_fwf_regression(self):
         # GH 3594
         # turns out 'T060' is parsable as a datetime slice!