diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index da13f724eb663..c9ea7b427b3f2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -305,6 +305,7 @@ Bug Fixes - Bug in ``DataFrame.reindex()`` in which ``method`` was ignored when passing ``columns`` (:issue:`14992`) - Bug in ``pd.to_numeric()`` in which float and unsigned integer elements were being improperly casted (:issue:`14941`, :issue:`15005`) - Bug in ``pd.read_csv()`` in which the ``dialect`` parameter was not being verified before processing (:issue:`14898`) +- Bug in ``pd.read_fwf()`` where the ``skiprows`` parameter was not being respected during column width inference (:issue:`11256`) - Bug in ``DataFrame.loc`` with indexing a ``MultiIndex`` with a ``Series`` indexer (:issue:`14730`) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8a9873b240602..41f1ab6fc16fb 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -323,7 +323,7 @@ fields of each line as half-open intervals (i.e., [from, to[ ). String value 'infer' can be used to instruct the parser to try detecting the column specifications from the first 100 rows of - the data (default='infer'). + the data which are not being skipped via skiprows (default='infer'). widths : list of ints. optional A list of field widths which can be used instead of 'colspecs' if the intervals are contiguous. @@ -3034,13 +3034,13 @@ class FixedWidthReader(BaseIterator): A reader of fixed-width lines. 
""" - def __init__(self, f, colspecs, delimiter, comment): + def __init__(self, f, colspecs, delimiter, comment, skiprows=None): self.f = f self.buffer = None self.delimiter = '\r\n' + delimiter if delimiter else '\n\r\t ' self.comment = comment if colspecs == 'infer': - self.colspecs = self.detect_colspecs() + self.colspecs = self.detect_colspecs(skiprows=skiprows) else: self.colspecs = colspecs @@ -3049,7 +3049,6 @@ def __init__(self, f, colspecs, delimiter, comment): "input was a %r" % type(colspecs).__name__) for colspec in self.colspecs: - if not (isinstance(colspec, (tuple, list)) and len(colspec) == 2 and isinstance(colspec[0], (int, np.integer, type(None))) and @@ -3057,20 +3056,50 @@ def __init__(self, f, colspecs, delimiter, comment): raise TypeError('Each column specification must be ' '2 element tuple or list of integers') - def get_rows(self, n): - rows = [] - for i, row in enumerate(self.f, 1): - rows.append(row) - if i >= n: + def get_rows(self, n, skiprows=None): + """ + Read rows from self.f, skipping as specified. + + We distinguish buffer_rows (the first <= n lines) + from the rows returned to detect_colspecs because + it's simpler to leave the other locations with + skiprows logic alone than to modify them to deal + with the fact we skipped some rows here as well. + + Parameters + ---------- + n : int + Number of rows to read from self.f, not counting + rows that are skipped. + skiprows: set, optional + Indices of rows to skip. + + Returns + ------- + detect_rows : list of str + A list containing the rows to read. 
+ + """ + if skiprows is None: + skiprows = set() + buffer_rows = [] + detect_rows = [] + for i, row in enumerate(self.f): + if i not in skiprows: + detect_rows.append(row) + buffer_rows.append(row) + if len(detect_rows) >= n: break - self.buffer = iter(rows) - return rows + self.buffer = iter(buffer_rows) + return detect_rows - def detect_colspecs(self, n=100): + def detect_colspecs(self, n=100, skiprows=None): # Regex escape the delimiters delimiters = ''.join([r'\%s' % x for x in self.delimiter]) pattern = re.compile('([^%s]+)' % delimiters) - rows = self.get_rows(n) + rows = self.get_rows(n, skiprows) + if not rows: + raise EmptyDataError("No rows from which to infer column width") max_len = max(map(len, rows)) mask = np.zeros(max_len + 1, dtype=int) if self.comment is not None: @@ -3081,7 +3110,8 @@ def detect_colspecs(self, n=100): shifted = np.roll(mask, 1) shifted[0] = 0 edges = np.where((mask ^ shifted) == 1)[0] - return list(zip(edges[::2], edges[1::2])) + edge_pairs = list(zip(edges[::2], edges[1::2])) + return edge_pairs def __next__(self): if self.buffer is not None: @@ -3106,9 +3136,8 @@ class FixedWidthFieldParser(PythonParser): def __init__(self, f, **kwds): # Support iterators, convert to a list. 
self.colspecs = kwds.pop('colspecs') - PythonParser.__init__(self, f, **kwds) def _make_reader(self, f): self.data = FixedWidthReader(f, self.colspecs, self.delimiter, - self.comment) + self.comment, self.skiprows) diff --git a/pandas/io/tests/parser/test_read_fwf.py b/pandas/io/tests/parser/test_read_fwf.py index 42b1116280a1e..a423355081ac3 100644 --- a/pandas/io/tests/parser/test_read_fwf.py +++ b/pandas/io/tests/parser/test_read_fwf.py @@ -16,7 +16,7 @@ from pandas import DataFrame from pandas import compat from pandas.compat import StringIO, BytesIO -from pandas.io.parsers import read_csv, read_fwf +from pandas.io.parsers import read_csv, read_fwf, EmptyDataError class TestFwfParsing(tm.TestCase): @@ -248,83 +248,83 @@ def test_bool_header_arg(self): def test_full_file(self): # File with all values - test = '''index A B C + test = """index A B C 2000-01-03T00:00:00 0.980268513777 3 foo 2000-01-04T00:00:00 1.04791624281 -4 bar 2000-01-05T00:00:00 0.498580885705 73 baz 2000-01-06T00:00:00 1.12020151869 1 foo 2000-01-07T00:00:00 0.487094399463 0 bar 2000-01-10T00:00:00 0.836648671666 2 baz -2000-01-11T00:00:00 0.157160753327 34 foo''' +2000-01-11T00:00:00 0.157160753327 34 foo""" colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_full_file_with_missing(self): # File with missing values - test = '''index A B C + test = """index A B C 2000-01-03T00:00:00 0.980268513777 3 foo 2000-01-04T00:00:00 1.04791624281 -4 bar 0.498580885705 73 baz 2000-01-06T00:00:00 1.12020151869 1 foo 2000-01-07T00:00:00 0 bar 2000-01-10T00:00:00 0.836648671666 2 baz - 34''' + 34""" colspecs = ((0, 19), (21, 35), (38, 40), (42, 45)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_full_file_with_spaces(self): # File with spaces in columns - test = ''' + test = """ Account Name Balance CreditLimit 
AccountCreated 101 Keanu Reeves 9315.45 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 8/6/2003 868 Jennifer Love Hewitt 0 17000.00 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 5000.00 2/5/2007 -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_full_file_with_spaces_and_missing(self): # File with spaces and missing values in columsn - test = ''' + test = """ Account Name Balance CreditLimit AccountCreated 101 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 8/6/2003 868 5/25/1985 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_messed_up_data(self): # Completely messed up file - test = ''' + test = """ Account Name Balance Credit Limit Account Created 101 10000.00 1/17/1998 312 Gerard Butler 90.00 1000.00 761 Jada Pinkett-Smith 49654.87 100000.00 12/5/2006 317 Bill Murray 789.65 -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79)) expected = read_fwf(StringIO(test), colspecs=colspecs) tm.assert_frame_equal(expected, read_fwf(StringIO(test))) def test_multiple_delimiters(self): - test = r''' + test = r""" col1~~~~~col2 col3++++++++++++++++++col4 ~~22.....11.0+++foo~~~~~~~~~~Keanu Reeves 33+++122.33\\\bar.........Gerard Butler ++44~~~~12.01 baz~~Jennifer Love Hewitt ~~55 11+++foo++++Jada Pinkett-Smith ..66++++++.03~~~bar Bill Murray -'''.strip('\r\n') +""".strip('\r\n') colspecs = ((0, 4), (7, 13), (15, 19), (21, 41)) expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=' +~.\\') @@ -335,11 +335,11 @@ def test_variable_width_unicode(self): if not compat.PY3: 
raise nose.SkipTest( 'Bytes-related test - only needs to work on Python 3') - test = ''' + test = """ שלום שלום ום שלל של ום -'''.strip('\r\n') +""".strip('\r\n') expected = read_fwf(BytesIO(test.encode('utf8')), colspecs=[(0, 4), (5, 9)], header=None, encoding='utf8') @@ -347,10 +347,10 @@ def test_variable_width_unicode(self): BytesIO(test.encode('utf8')), header=None, encoding='utf8')) def test_dtype(self): - data = ''' a b c + data = """ a b c 1 2 3.2 3 4 5.2 -''' +""" colspecs = [(0, 5), (5, 10), (10, None)] result = pd.read_fwf(StringIO(data), colspecs=colspecs) expected = pd.DataFrame({ @@ -365,3 +365,41 @@ def test_dtype(self): result = pd.read_fwf(StringIO(data), colspecs=colspecs, dtype={'a': 'float64', 'b': str, 'c': 'int32'}) tm.assert_frame_equal(result, expected) + + def test_skiprows_inference(self): + # GH11256 + test = """ +Text contained in the file header + +DataCol1 DataCol2 + 0.0 1.0 + 101.6 956.1 +""".strip() + expected = read_csv(StringIO(test), skiprows=2, + delim_whitespace=True) + tm.assert_frame_equal(expected, read_fwf( + StringIO(test), skiprows=2)) + + def test_skiprows_by_index_inference(self): + test = """ +To be skipped +Not To Be Skipped +Once more to be skipped +123 34 8 123 +456 78 9 456 +""".strip() + + expected = read_csv(StringIO(test), skiprows=[0, 2], + delim_whitespace=True) + tm.assert_frame_equal(expected, read_fwf( + StringIO(test), skiprows=[0, 2])) + + def test_skiprows_inference_empty(self): + test = """ +AA BBB C +12 345 6 +78 901 2 +""".strip() + + with tm.assertRaises(EmptyDataError): + read_fwf(StringIO(test), skiprows=3)