Skip to content

Commit 1f02bf2

Browse files
rdmontgomery authored and WillAyd committed
ENH/TST/DOC: set infer_nrows for read_fwf (GH15138) (#23238)
1 parent 209e7f5 commit 1f02bf2

File tree

3 files changed

+44
-15
lines changed

3 files changed

+44
-15
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ Other Enhancements
286286
- New attribute :attr:`__git_version__` will return git commit sha of current build (:issue:`21295`).
287287
- Compatibility with Matplotlib 3.0 (:issue:`22790`).
288288
- Added :meth:`Interval.overlaps`, :meth:`IntervalArray.overlaps`, and :meth:`IntervalIndex.overlaps` for determining overlaps between interval-like objects (:issue:`21998`)
289+
- :func:`read_fwf` now accepts keyword ``infer_nrows`` (:issue:`15138`).
289290
- :func:`~DataFrame.to_parquet` now supports writing a ``DataFrame`` as a directory of parquet files partitioned by a subset of the columns when ``engine = 'pyarrow'`` (:issue:`23283`)
290291
- :meth:`Timestamp.tz_localize`, :meth:`DatetimeIndex.tz_localize`, and :meth:`Series.tz_localize` have gained the ``nonexistent`` argument for alternative handling of nonexistent times. See :ref:`timeseries.timezone_nonexistent` (:issue:`8917`)
291292
- :meth:`Index.difference` now has an optional ``sort`` parameter to specify whether the results should be sorted if possible (:issue:`17839`)

pandas/io/parsers.py

+27-15
Original file line numberDiff line numberDiff line change
@@ -501,6 +501,7 @@ def _read(filepath_or_buffer, kwds):
501501

502502
_fwf_defaults = {
503503
'colspecs': 'infer',
504+
'infer_nrows': 100,
504505
'widths': None,
505506
}
506507

@@ -718,8 +719,8 @@ def parser_f(filepath_or_buffer,
718719
)(read_table)
719720

720721

721-
def read_fwf(filepath_or_buffer, colspecs='infer',
722-
widths=None, **kwds):
722+
def read_fwf(filepath_or_buffer, colspecs='infer', widths=None,
723+
infer_nrows=100, **kwds):
723724

724725
r"""
725726
Read a table of fixed-width formatted lines into DataFrame.
@@ -752,6 +753,11 @@ def read_fwf(filepath_or_buffer, colspecs='infer',
752753
widths : list of int, optional
753754
A list of field widths which can be used instead of 'colspecs' if
754755
the intervals are contiguous.
756+
infer_nrows : int, default 100
757+
The number of rows to consider when letting the parser determine the
758+
`colspecs`.
759+
760+
.. versionadded:: 0.24.0
755761
**kwds : optional
756762
Optional keyword arguments can be passed to ``TextFileReader``.
757763
@@ -786,6 +792,7 @@ def read_fwf(filepath_or_buffer, colspecs='infer',
786792
col += w
787793

788794
kwds['colspecs'] = colspecs
795+
kwds['infer_nrows'] = infer_nrows
789796
kwds['engine'] = 'python-fwf'
790797
return _read(filepath_or_buffer, kwds)
791798

@@ -3442,13 +3449,15 @@ class FixedWidthReader(BaseIterator):
34423449
A reader of fixed-width lines.
34433450
"""
34443451

3445-
def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
3452+
def __init__(self, f, colspecs, delimiter, comment, skiprows=None,
3453+
infer_nrows=100):
34463454
self.f = f
34473455
self.buffer = None
34483456
self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
34493457
self.comment = comment
34503458
if colspecs == 'infer':
3451-
self.colspecs = self.detect_colspecs(skiprows=skiprows)
3459+
self.colspecs = self.detect_colspecs(infer_nrows=infer_nrows,
3460+
skiprows=skiprows)
34523461
else:
34533462
self.colspecs = colspecs
34543463

@@ -3464,19 +3473,20 @@ def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
34643473
raise TypeError('Each column specification must be '
34653474
'2 element tuple or list of integers')
34663475

3467-
def get_rows(self, n, skiprows=None):
3476+
def get_rows(self, infer_nrows, skiprows=None):
34683477
"""
34693478
Read rows from self.f, skipping as specified.
34703479
3471-
We distinguish buffer_rows (the first <= n lines)
3472-
from the rows returned to detect_colspecs because
3473-
it's simpler to leave the other locations with
3474-
skiprows logic alone than to modify them to deal
3475-
with the fact we skipped some rows here as well.
3480+
We distinguish buffer_rows (the first <= infer_nrows
3481+
lines) from the rows returned to detect_colspecs
3482+
because it's simpler to leave the other locations
3483+
with skiprows logic alone than to modify them to
3484+
deal with the fact we skipped some rows here as
3485+
well.
34763486
34773487
Parameters
34783488
----------
3479-
n : int
3489+
infer_nrows : int
34803490
Number of rows to read from self.f, not counting
34813491
rows that are skipped.
34823492
skiprows : set, optional
@@ -3496,16 +3506,16 @@ def get_rows(self, n, skiprows=None):
34963506
if i not in skiprows:
34973507
detect_rows.append(row)
34983508
buffer_rows.append(row)
3499-
if len(detect_rows) >= n:
3509+
if len(detect_rows) >= infer_nrows:
35003510
break
35013511
self.buffer = iter(buffer_rows)
35023512
return detect_rows
35033513

3504-
def detect_colspecs(self, n=100, skiprows=None):
3514+
def detect_colspecs(self, infer_nrows=100, skiprows=None):
35053515
# Regex escape the delimiters
35063516
delimiters = ''.join(r'\%s' % x for x in self.delimiter)
35073517
pattern = re.compile('([^%s]+)' % delimiters)
3508-
rows = self.get_rows(n, skiprows)
3518+
rows = self.get_rows(infer_nrows, skiprows)
35093519
if not rows:
35103520
raise EmptyDataError("No rows from which to infer column width")
35113521
max_len = max(map(len, rows))
@@ -3544,8 +3554,10 @@ class FixedWidthFieldParser(PythonParser):
35443554
def __init__(self, f, **kwds):
35453555
# Support iterators, convert to a list.
35463556
self.colspecs = kwds.pop('colspecs')
3557+
self.infer_nrows = kwds.pop('infer_nrows')
35473558
PythonParser.__init__(self, f, **kwds)
35483559

35493560
def _make_reader(self, f):
35503561
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
3551-
self.comment, self.skiprows)
3562+
self.comment, self.skiprows,
3563+
self.infer_nrows)

pandas/tests/io/parser/test_read_fwf.py

+16
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,22 @@ def test_fwf_colspecs_None(self):
140140
expected = DataFrame([[123456, 456], [456789, 789]])
141141
tm.assert_frame_equal(result, expected)
142142

143+
def test_fwf_colspecs_infer_nrows(self):
144+
# GH 15138
145+
data = """\
146+
  1  2
147+
123 98
148+
"""
149+
# infer_nrows == 1 should have colspecs == [(2, 3), (5, 6)]
150+
df = read_fwf(StringIO(data), header=None, infer_nrows=1)
151+
expected = pd.DataFrame([[1, 2], [3, 8]])
152+
tm.assert_frame_equal(df, expected)
153+
154+
# test for infer_nrows > number of rows
155+
df = read_fwf(StringIO(data), header=None, infer_nrows=10)
156+
expected = pd.DataFrame([[1, 2], [123, 98]])
157+
tm.assert_frame_equal(df, expected)
158+
143159
def test_fwf_regression(self):
144160
# GH 3594
145161
# turns out 'T060' is parsable as a datetime slice!

0 commit comments

Comments
 (0)