Skip to content

Commit 11aa4e3

Browse files
committed
BUG: read_fwf inference should respect skiprows (#11256)
1 parent 5d791cc commit 11aa4e3

File tree

3 files changed

+71
-16
lines changed

3 files changed

+71
-16
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -992,6 +992,7 @@ Bug Fixes
992992
- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)
993993

994994
- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)
995+
- Bug in ``pd.read_fwf`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)
995996

996997
- Bug in ``Index`` where the ``KeyError`` raised displays the incorrect column when the column is not in the df and the columns contain duplicate values (:issue:`13822`)
997998
- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)

pandas/io/parsers.py

+31-15
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@
311311
fields of each line as half-open intervals (i.e., [from, to[ ).
312312
String value 'infer' can be used to instruct the parser to try
313313
detecting the column specifications from the first 100 rows of
314-
the data (default='infer').
314+
the data which are not being skipped via skiprows (default='infer').
315315
widths : list of ints. optional
316316
A list of field widths which can be used instead of 'colspecs' if
317317
the intervals are contiguous.
@@ -2852,13 +2852,15 @@ class FixedWidthReader(BaseIterator):
28522852
A reader of fixed-width lines.
28532853
"""
28542854

2855-
def __init__(self, f, colspecs, delimiter, comment):
2855+
def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
28562856
self.f = f
28572857
self.buffer = None
28582858
self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
28592859
self.comment = comment
2860+
if skiprows is None:
2861+
skiprows = set()
28602862
if colspecs == 'infer':
2861-
self.colspecs = self.detect_colspecs()
2863+
self.colspecs = self.detect_colspecs(skiprows=skiprows)
28622864
else:
28632865
self.colspecs = colspecs
28642866

@@ -2875,20 +2877,34 @@ def __init__(self, f, colspecs, delimiter, comment):
28752877
raise TypeError('Each column specification must be '
28762878
'2 element tuple or list of integers')
28772879

2878-
def get_rows(self, n):
2879-
rows = []
2880-
for i, row in enumerate(self.f, 1):
2881-
rows.append(row)
2882-
if i >= n:
2880+
def get_rows(self, n, skiprows=None):
2881+
"""
2882+
We distinguish buffer_rows (every line read, including skipped ones)
2883+
from the rows returned to detect_colspecs because
2884+
it's simpler to leave the other locations with
2885+
skiprows logic alone than to modify them to deal
2886+
with the fact we skipped some rows here as well.
2887+
"""
2888+
if skiprows is None:
2889+
skiprows = set()
2890+
buffer_rows = []
2891+
detect_rows = []
2892+
for i, row in enumerate(self.f):
2893+
if i not in skiprows:
2894+
detect_rows.append(row)
2895+
buffer_rows.append(row)
2896+
if len(detect_rows) >= n:
28832897
break
2884-
self.buffer = iter(rows)
2885-
return rows
2898+
self.buffer = iter(buffer_rows)
2899+
return detect_rows
28862900

2887-
def detect_colspecs(self, n=100):
2901+
def detect_colspecs(self, n=100, skiprows=None):
28882902
# Regex escape the delimiters
28892903
delimiters = ''.join([r'\%s' % x for x in self.delimiter])
28902904
pattern = re.compile('([^%s]+)' % delimiters)
2891-
rows = self.get_rows(n)
2905+
rows = self.get_rows(n, skiprows)
2906+
if not rows:
2907+
raise EmptyDataError("No rows from which to infer column width")
28922908
max_len = max(map(len, rows))
28932909
mask = np.zeros(max_len + 1, dtype=int)
28942910
if self.comment is not None:
@@ -2899,7 +2915,8 @@ def detect_colspecs(self, n=100):
28992915
shifted = np.roll(mask, 1)
29002916
shifted[0] = 0
29012917
edges = np.where((mask ^ shifted) == 1)[0]
2902-
return list(zip(edges[::2], edges[1::2]))
2918+
edge_pairs = list(zip(edges[::2], edges[1::2]))
2919+
return edge_pairs
29032920

29042921
def __next__(self):
29052922
if self.buffer is not None:
@@ -2924,9 +2941,8 @@ class FixedWidthFieldParser(PythonParser):
29242941
def __init__(self, f, **kwds):
29252942
# Support iterators, convert to a list.
29262943
self.colspecs = kwds.pop('colspecs')
2927-
29282944
PythonParser.__init__(self, f, **kwds)
29292945

29302946
def _make_reader(self, f):
29312947
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
2932-
self.comment)
2948+
self.comment, self.skiprows)

pandas/io/tests/parser/test_read_fwf.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pandas import DataFrame
1717
from pandas import compat
1818
from pandas.compat import StringIO, BytesIO
19-
from pandas.io.parsers import read_csv, read_fwf
19+
from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
2020

2121

2222
class TestFwfParsing(tm.TestCase):
@@ -345,3 +345,41 @@ def test_variable_width_unicode(self):
345345
header=None, encoding='utf8')
346346
tm.assert_frame_equal(expected, read_fwf(
347347
BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
348+
349+
def test_skiprows_inference(self):
350+
# GH11256
351+
test = '''
352+
Text contained in the file header
353+
354+
DataCol1 DataCol2
355+
0.0 1.0
356+
101.6 956.1
357+
'''.strip()
358+
expected = read_csv(StringIO(test), skiprows=2,
359+
delim_whitespace=True)
360+
tm.assert_frame_equal(expected, read_fwf(
361+
StringIO(test), skiprows=2))
362+
363+
def test_skiprows_by_index_inference(self):
364+
test = '''
365+
To be skipped
366+
Not To Be Skipped
367+
Once more to be skipped
368+
123 34 8 123
369+
456 78 9 456
370+
'''.strip()
371+
372+
expected = read_csv(StringIO(test), skiprows=[0, 2],
373+
delim_whitespace=True)
374+
tm.assert_frame_equal(expected, read_fwf(
375+
StringIO(test), skiprows=[0, 2]))
376+
377+
def test_skiprows_inference_empty(self):
378+
test = '''
379+
AA BBB C
380+
12 345 6
381+
78 901 2
382+
'''.strip()
383+
384+
with tm.assertRaises(EmptyDataError):
385+
read_fwf(StringIO(test), skiprows=3)

0 commit comments

Comments
 (0)