Skip to content

BUG: read_fwf inference should respect skiprows (#11256) #14028

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,7 @@ Bug Fixes
- Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`)
- Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly cast (:issue:`14941`, :issue:`15005`)
- Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`)
- Bug in ``pd.read_fwf()`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`)

- Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`)

Expand Down
61 changes: 45 additions & 16 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@
fields of each line as half-open intervals (i.e., [from, to[ ).
String value 'infer' can be used to instruct the parser to try
detecting the column specifications from the first 100 rows of
the data (default='infer').
the data which are not being skipped via skiprows (default='infer').
widths : list of ints. optional
A list of field widths which can be used instead of 'colspecs' if
the intervals are contiguous.
Expand Down Expand Up @@ -3034,13 +3034,13 @@ class FixedWidthReader(BaseIterator):
A reader of fixed-width lines.
"""

def __init__(self, f, colspecs, delimiter, comment):
def __init__(self, f, colspecs, delimiter, comment, skiprows=None):
self.f = f
self.buffer = None
self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t '
self.comment = comment
if colspecs == 'infer':
self.colspecs = self.detect_colspecs()
self.colspecs = self.detect_colspecs(skiprows=skiprows)
else:
self.colspecs = colspecs

Expand All @@ -3049,28 +3049,57 @@ def __init__(self, f, colspecs, delimiter, comment):
"input was a %r" % type(colspecs).__name__)

for colspec in self.colspecs:

if not (isinstance(colspec, (tuple, list)) and
len(colspec) == 2 and
isinstance(colspec[0], (int, np.integer, type(None))) and
isinstance(colspec[1], (int, np.integer, type(None)))):
raise TypeError('Each column specification must be '
'2 element tuple or list of integers')

def get_rows(self, n):
rows = []
for i, row in enumerate(self.f, 1):
rows.append(row)
if i >= n:
def get_rows(self, n, skiprows=None):
"""
Read rows from self.f, skipping as specified.

We distinguish buffer_rows (the first <= n lines)
from the rows returned to detect_colspecs because
it's simpler to leave the other locations with
skiprows logic alone than to modify them to deal
with the fact we skipped some rows here as well.

Parameters
----------
n : int
Number of rows to read from self.f, not counting
rows that are skipped.
skiprows : set, optional
Indices of rows to skip.

Returns
-------
detect_rows : list of str
A list containing the rows to read.

"""
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

add a Parameters/Returns

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah. This wasn't really meant to be a docstring, just a multiline comment.. but I can add a docstring, I guess. :-)

if skiprows is None:
skiprows = set()
buffer_rows = []
detect_rows = []
for i, row in enumerate(self.f):
if i not in skiprows:
detect_rows.append(row)
buffer_rows.append(row)
if len(detect_rows) >= n:
break
self.buffer = iter(rows)
return rows
self.buffer = iter(buffer_rows)
return detect_rows

def detect_colspecs(self, n=100):
def detect_colspecs(self, n=100, skiprows=None):
# Regex escape the delimiters
delimiters = ''.join([r'\%s' % x for x in self.delimiter])
pattern = re.compile('([^%s]+)' % delimiters)
rows = self.get_rows(n)
rows = self.get_rows(n, skiprows)
if not rows:
raise EmptyDataError("No rows from which to infer column width")
max_len = max(map(len, rows))
mask = np.zeros(max_len + 1, dtype=int)
if self.comment is not None:
Expand All @@ -3081,7 +3110,8 @@ def detect_colspecs(self, n=100):
shifted = np.roll(mask, 1)
shifted[0] = 0
edges = np.where((mask ^ shifted) == 1)[0]
return list(zip(edges[::2], edges[1::2]))
edge_pairs = list(zip(edges[::2], edges[1::2]))
return edge_pairs

def __next__(self):
if self.buffer is not None:
Expand All @@ -3106,9 +3136,8 @@ class FixedWidthFieldParser(PythonParser):
def __init__(self, f, **kwds):
# Support iterators, convert to a list.
self.colspecs = kwds.pop('colspecs')

PythonParser.__init__(self, f, **kwds)

def _make_reader(self, f):
self.data = FixedWidthReader(f, self.colspecs, self.delimiter,
self.comment)
self.comment, self.skiprows)
72 changes: 55 additions & 17 deletions pandas/io/tests/parser/test_read_fwf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from pandas import DataFrame
from pandas import compat
from pandas.compat import StringIO, BytesIO
from pandas.io.parsers import read_csv, read_fwf
from pandas.io.parsers import read_csv, read_fwf, EmptyDataError


class TestFwfParsing(tm.TestCase):
Expand Down Expand Up @@ -248,83 +248,83 @@ def test_bool_header_arg(self):

def test_full_file(self):
# File with all values
test = '''index A B C
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
2000-01-05T00:00:00 0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0.487094399463 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
2000-01-11T00:00:00 0.157160753327 34 foo'''
2000-01-11T00:00:00 0.157160753327 34 foo"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_full_file_with_missing(self):
# File with missing values
test = '''index A B C
test = """index A B C
2000-01-03T00:00:00 0.980268513777 3 foo
2000-01-04T00:00:00 1.04791624281 -4 bar
0.498580885705 73 baz
2000-01-06T00:00:00 1.12020151869 1 foo
2000-01-07T00:00:00 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
34'''
34"""
colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_full_file_with_spaces(self):
# File with spaces in columns
test = '''
test = """
Account Name Balance CreditLimit AccountCreated
101 Keanu Reeves 9315.45 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 Jennifer Love Hewitt 0 17000.00 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65 5000.00 2/5/2007
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_full_file_with_spaces_and_missing(self):
# File with spaces and missing values in columns
test = '''
test = """
Account Name Balance CreditLimit AccountCreated
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00 8/6/2003
868 5/25/1985
761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_messed_up_data(self):
# Completely messed up file
test = '''
test = """
Account Name Balance Credit Limit Account Created
101 10000.00 1/17/1998
312 Gerard Butler 90.00 1000.00

761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006
317 Bill Murray 789.65
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
expected = read_fwf(StringIO(test), colspecs=colspecs)
tm.assert_frame_equal(expected, read_fwf(StringIO(test)))

def test_multiple_delimiters(self):
test = r'''
test = r"""
col1~~~~~col2 col3++++++++++++++++++col4
~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves
33+++122.33\\\bar.........Gerard Butler
++44~~~~12.01 baz~~Jennifer Love Hewitt
~~55 11+++foo++++Jada Pinkett-Smith
..66++++++.03~~~bar Bill Murray
'''.strip('\r\n')
""".strip('\r\n')
colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
expected = read_fwf(StringIO(test), colspecs=colspecs,
delimiter=' +~.\\')
Expand All @@ -335,22 +335,22 @@ def test_variable_width_unicode(self):
if not compat.PY3:
raise nose.SkipTest(
'Bytes-related test - only needs to work on Python 3')
test = '''
test = """
שלום שלום
ום שלל
של ום
'''.strip('\r\n')
""".strip('\r\n')
expected = read_fwf(BytesIO(test.encode('utf8')),
colspecs=[(0, 4), (5, 9)],
header=None, encoding='utf8')
tm.assert_frame_equal(expected, read_fwf(
BytesIO(test.encode('utf8')), header=None, encoding='utf8'))

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test that exercises the empty-data case (e.g. when skiprows is set and there are not enough rows left, I think)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Beyond test_skiprows_inference_empty?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does that exercise read_fwf?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if it uses self.read_csv it should, but not sure fwf is really all that well tested.

def test_dtype(self):
data = ''' a b c
data = """ a b c
1 2 3.2
3 4 5.2
'''
"""
colspecs = [(0, 5), (5, 10), (10, None)]
result = pd.read_fwf(StringIO(data), colspecs=colspecs)
expected = pd.DataFrame({
Expand All @@ -365,3 +365,41 @@ def test_dtype(self):
result = pd.read_fwf(StringIO(data), colspecs=colspecs,
dtype={'a': 'float64', 'b': str, 'c': 'int32'})
tm.assert_frame_equal(result, expected)

def test_skiprows_inference(self):
# GH11256
test = """
Text contained in the file header

DataCol1 DataCol2
0.0 1.0
101.6 956.1
""".strip()
expected = read_csv(StringIO(test), skiprows=2,
delim_whitespace=True)
tm.assert_frame_equal(expected, read_fwf(
StringIO(test), skiprows=2))

def test_skiprows_by_index_inference(self):
test = """
To be skipped
Not To Be Skipped
Once more to be skipped
123 34 8 123
456 78 9 456
""".strip()

expected = read_csv(StringIO(test), skiprows=[0, 2],
delim_whitespace=True)
tm.assert_frame_equal(expected, read_fwf(
StringIO(test), skiprows=[0, 2]))

def test_skiprows_inference_empty(self):
test = """
AA BBB C
12 345 6
78 901 2
""".strip()

with tm.assertRaises(EmptyDataError):
read_fwf(StringIO(test), skiprows=3)