Skip to content

Commit aa03e7f

Browse files
dsm054 authored and jreback committed
BUG: read_fwf inference should respect skiprows (pandas-dev#11256)
Rows excluded via skiprows were still being used when inferring colspecs. Fix this by passing skiprows down the call chain to the point where the inference happens. - [X] closes pandas-dev#11256 - [X] 3 tests added / passed - [X] passes `git diff upstream/master | flake8 --diff` - [X] whatsnew entry Author: D.S. McNeil <[email protected]> Closes pandas-dev#14028 from dsm054/bugfix/fwf_skiprows and squashes the following commits: b5b3e66 [D.S. McNeil] BUG: read_fwf inference should respect skiprows (pandas-dev#11256)
1 parent e1a4144 commit aa03e7f

File tree

3 files changed

+101
-33
lines changed

3 files changed

+101
-33
lines changed

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ Bug Fixes
305305
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
306306
- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`)
307307
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
308+
- Bug in ``pd.read_fwf()`` in which the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)
308309

309310
- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)
310311

pandas/io/parsers.py

+45-16
Original file line numberDiff line numberDiff line change
@@ -323,7 +323,7 @@
323323
fields of each line as half-open intervals (i.e., [from, to[ ).
324324
String value 'infer' can be used to instruct the parser to try
325325
detecting the column specifications from the first 100 rows of
326-
the data (default='infer').
326+
the data that are not skipped via skiprows (default='infer').
327327
widths : list of ints. optional
328328
A list of field widths which can be used instead of 'colspecs' if
329329
the intervals are contiguous.
@@ -3034,13 +3034,13 @@ class FixedWidthReader(BaseIterator):
30343034
A reader of fixed-width lines.
30353035
"""
30363036

3037-
def __init__(self, f, colspecs, delimiter, comment):
3037+
def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
30383038
self.f = f
30393039
self.buffer = None
30403040
self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
30413041
self.comment = comment
30423042
if colspecs == 'infer':
3043-
self.colspecs = self.detect_colspecs()
3043+
self.colspecs = self.detect_colspecs(skiprows=skiprows)
30443044
else:
30453045
self.colspecs = colspecs
30463046

@@ -3049,28 +3049,57 @@ def __init__(self, f, colspecs, delimiter, comment):
30493049
"input was a %r" % type(colspecs).__name__)
30503050

30513051
for colspec in self.colspecs:
3052-
30533052
if not (isinstance(colspec, (tuple, list)) and
30543053
len(colspec) == 2 and
30553054
isinstance(colspec[0], (int, np.integer, type(None))) and
30563055
isinstance(colspec[1], (int, np.integer, type(None)))):
30573056
raise TypeError('Each column specification must be '
30583057
'2 element tuple or list of integers')
30593058

3060-
def get_rows(self, n):
3061-
rows = []
3062-
for i, row in enumerate(self.f, 1):
3063-
rows.append(row)
3064-
if i >= n:
3059+
def get_rows(self, n, skiprows=None):
3060+
"""
3061+
Read rows from self.f, skipping as specified.
3062+
3063+
We distinguish buffer_rows (every line consumed, skipped or not)
3064+
from the rows returned to detect_colspecs because
3065+
it's simpler to leave the other locations with
3066+
skiprows logic alone than to modify them to deal
3067+
with the fact we skipped some rows here as well.
3068+
3069+
Parameters
3070+
----------
3071+
n : int
3072+
Number of rows to read from self.f, not counting
3073+
rows that are skipped.
3074+
skiprows : set, optional
3075+
Indices of rows to skip.
3076+
3077+
Returns
3078+
-------
3079+
detect_rows : list of str
3080+
A list containing the rows that were read and not skipped.
3081+
3082+
"""
3083+
if skiprows is None:
3084+
skiprows = set()
3085+
buffer_rows = []
3086+
detect_rows = []
3087+
for i, row in enumerate(self.f):
3088+
if i not in skiprows:
3089+
detect_rows.append(row)
3090+
buffer_rows.append(row)
3091+
if len(detect_rows) >= n:
30653092
break
3066-
self.buffer = iter(rows)
3067-
return rows
3093+
self.buffer = iter(buffer_rows)
3094+
return detect_rows
30683095

3069-
def detect_colspecs(self, n=100):
3096+
def detect_colspecs(self, n=100, skiprows=None):
30703097
# Regex escape the delimiters
30713098
delimiters = ''.join([r'\%s' % x for x in self.delimiter])
30723099
pattern = re.compile('([^%s]+)' % delimiters)
3073-
rows = self.get_rows(n)
3100+
rows = self.get_rows(n, skiprows)
3101+
if not rows:
3102+
raise EmptyDataError("No rows from which to infer column width")
30743103
max_len = max(map(len, rows))
30753104
mask = np.zeros(max_len + 1, dtype=int)
30763105
if self.comment is not None:
@@ -3081,7 +3110,8 @@ def detect_colspecs(self, n=100):
30813110
shifted = np.roll(mask, 1)
30823111
shifted[0] = 0
30833112
edges = np.where((mask ^ shifted) == 1)[0]
3084-
return list(zip(edges[::2], edges[1::2]))
3113+
edge_pairs = list(zip(edges[::2], edges[1::2]))
3114+
return edge_pairs
30853115

30863116
def __next__(self):
30873117
if self.buffer is not None:
@@ -3106,9 +3136,8 @@ class FixedWidthFieldParser(PythonParser):
31063136
def __init__(self, f, **kwds):
31073137
# Support iterators, convert to a list.
31083138
self.colspecs = kwds.pop('colspecs')
3109-
31103139
PythonParser.__init__(self, f, **kwds)
31113140

31123141
def _make_reader(self, f):
31133142
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
3114-
self.comment)
3143+
self.comment, self.skiprows)

pandas/io/tests/parser/test_read_fwf.py

+55-17
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
from pandas import DataFrame
1717
from pandas import compat
1818
from pandas.compat import StringIO, BytesIO
19-
from pandas.io.parsers import read_csv, read_fwf
19+
from pandas.io.parsers import read_csv, read_fwf, EmptyDataError
2020

2121

2222
class TestFwfParsing(tm.TestCase):
@@ -248,83 +248,83 @@ def test_bool_header_arg(self):
248248

249249
def test_full_file(self):
250250
# File with all values
251-
test = '''index A B C
251+
test = """index A B C
252252
2000-01-03T00:00:00 0.980268513777 3 foo
253253
2000-01-04T00:00:00 1.04791624281 -4 bar
254254
2000-01-05T00:00:00 0.498580885705 73 baz
255255
2000-01-06T00:00:00 1.12020151869 1 foo
256256
2000-01-07T00:00:00 0.487094399463 0 bar
257257
2000-01-10T00:00:00 0.836648671666 2 baz
258-
2000-01-11T00:00:00 0.157160753327 34 foo'''
258+
2000-01-11T00:00:00 0.157160753327 34 foo"""
259259
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
260260
expected = read_fwf(StringIO(test), colspecs=colspecs)
261261
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
262262

263263
def test_full_file_with_missing(self):
264264
# File with missing values
265-
test = '''index A B C
265+
test = """index A B C
266266
2000-01-03T00:00:00 0.980268513777 3 foo
267267
2000-01-04T00:00:00 1.04791624281 -4 bar
268268
0.498580885705 73 baz
269269
2000-01-06T00:00:00 1.12020151869 1 foo
270270
2000-01-07T00:00:00 0 bar
271271
2000-01-10T00:00:00 0.836648671666 2 baz
272-
34'''
272+
34"""
273273
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
274274
expected = read_fwf(StringIO(test), colspecs=colspecs)
275275
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
276276

277277
def test_full_file_with_spaces(self):
278278
# File with spaces in columns
279-
test = '''
279+
test = """
280280
Account Name Balance CreditLimit AccountCreated
281281
101 Keanu Reeves 9315.45 10000.00 1/17/1998
282282
312 Gerard Butler 90.00 1000.00 8/6/2003
283283
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
284284
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
285285
317 Bill Murray 789.65 5000.00 2/5/2007
286-
'''.strip('\r\n')
286+
""".strip('\r\n')
287287
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
288288
expected = read_fwf(StringIO(test), colspecs=colspecs)
289289
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
290290

291291
def test_full_file_with_spaces_and_missing(self):
292292
# File with spaces and missing values in columns
293-
test = '''
293+
test = """
294294
Account Name Balance CreditLimit AccountCreated
295295
101 10000.00 1/17/1998
296296
312 Gerard Butler 90.00 1000.00 8/6/2003
297297
868 5/25/1985
298298
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
299299
317 Bill Murray 789.65
300-
'''.strip('\r\n')
300+
""".strip('\r\n')
301301
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
302302
expected = read_fwf(StringIO(test), colspecs=colspecs)
303303
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
304304

305305
def test_messed_up_data(self):
306306
# Completely messed up file
307-
test = '''
307+
test = """
308308
Account Name Balance Credit Limit Account Created
309309
101 10000.00 1/17/1998
310310
312 Gerard Butler 90.00 1000.00
311311
312312
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
313313
317 Bill Murray 789.65
314-
'''.strip('\r\n')
314+
""".strip('\r\n')
315315
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
316316
expected = read_fwf(StringIO(test), colspecs=colspecs)
317317
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))
318318

319319
def test_multiple_delimiters(self):
320-
test = r'''
320+
test = r"""
321321
col1~~~~~col2 col3++++++++++++++++++col4
322322
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
323323
33+++122.33\\\bar.........Gerard Butler
324324
++44~~~~12.01 baz~~Jennifer Love Hewitt
325325
~~55 11+++foo++++Jada Pinkett-Smith
326326
..66++++++.03~~~bar Bill Murray
327-
'''.strip('\r\n')
327+
""".strip('\r\n')
328328
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
329329
expected = read_fwf(StringIO(test), colspecs=colspecs,
330330
delimiter=' +~.\\')
@@ -335,22 +335,22 @@ def test_variable_width_unicode(self):
335335
if not compat.PY3:
336336
raise nose.SkipTest(
337337
'Bytes-related test - only needs to work on Python 3')
338-
test = '''
338+
test = """
339339
שלום שלום
340340
ום שלל
341341
של ום
342-
'''.strip('\r\n')
342+
""".strip('\r\n')
343343
expected = read_fwf(BytesIO(test.encode('utf8')),
344344
colspecs=[(0, 4), (5, 9)],
345345
header=None, encoding='utf8')
346346
tm.assert_frame_equal(expected, read_fwf(
347347
BytesIO(test.encode('utf8')), header=None, encoding='utf8'))
348348

349349
def test_dtype(self):
350-
data = ''' a b c
350+
data = """ a b c
351351
1 2 3.2
352352
3 4 5.2
353-
'''
353+
"""
354354
colspecs = [(0, 5), (5, 10), (10, None)]
355355
result = pd.read_fwf(StringIO(data), colspecs=colspecs)
356356
expected = pd.DataFrame({
@@ -365,3 +365,41 @@ def test_dtype(self):
365365
result = pd.read_fwf(StringIO(data), colspecs=colspecs,
366366
dtype={'a': 'float64', 'b': str, 'c': 'int32'})
367367
tm.assert_frame_equal(result, expected)
368+
369+
def test_skiprows_inference(self):
370+
# GH11256
371+
test = """
372+
Text contained in the file header
373+
374+
DataCol1 DataCol2
375+
0.0 1.0
376+
101.6 956.1
377+
""".strip()
378+
expected = read_csv(StringIO(test), skiprows=2,
379+
delim_whitespace=True)
380+
tm.assert_frame_equal(expected, read_fwf(
381+
StringIO(test), skiprows=2))
382+
383+
def test_skiprows_by_index_inference(self):
384+
test = """
385+
To be skipped
386+
Not To Be Skipped
387+
Once more to be skipped
388+
123 34 8 123
389+
456 78 9 456
390+
""".strip()
391+
392+
expected = read_csv(StringIO(test), skiprows=[0, 2],
393+
delim_whitespace=True)
394+
tm.assert_frame_equal(expected, read_fwf(
395+
StringIO(test), skiprows=[0, 2]))
396+
397+
def test_skiprows_inference_empty(self):
398+
test = """
399+
AA BBB C
400+
12 345 6
401+
78 901 2
402+
""".strip()
403+
404+
with tm.assertRaises(EmptyDataError):
405+
read_fwf(StringIO(test), skiprows=3)

0 commit comments

Comments
 (0)