diff --git a/doc/source/release.rst b/doc/source/release.rst index 285cea7938f91..9c2032212e3c8 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -457,6 +457,7 @@ Bug Fixes weren't strings (:issue:`4956`) - Fixed ``copy()`` to shallow copy axes/indices as well and thereby keep separate metadata. (:issue:`4202`, :issue:`4830`) + - Fixed skiprows option in Python parser for read_csv (:issue:`4382`) pandas 0.12.0 ------------- diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 7b9347a821fad..380fd04fb4433 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1283,7 +1283,6 @@ def __init__(self, f, **kwds): # needs to be cleaned/refactored # multiple date column thing turning into a real spaghetti factory - if not self._has_complex_date_col: (index_names, self.orig_names, _) = self._get_index_name(self.columns) @@ -1561,8 +1560,6 @@ def _get_index_name(self, columns): except StopIteration: next_line = None - index_name = None - # implicitly index_col=0 b/c 1 fewer column names implicit_first_cols = 0 if line is not None: @@ -1647,11 +1644,20 @@ def _get_lines(self, rows=None): if self.pos > len(source): raise StopIteration if rows is None: - lines.extend(source[self.pos:]) - self.pos = len(source) + new_rows = source[self.pos:] + new_pos = len(source) else: - lines.extend(source[self.pos:self.pos + rows]) - self.pos += rows + new_rows = source[self.pos:self.pos + rows] + new_pos = self.pos + rows + + # Filter out skipped rows. n.b.: self.skiprows is a set. 
+ if self.skiprows: + new_rows = [row for i, row in enumerate(new_rows) + if i + self.pos not in self.skiprows] + + lines.extend(new_rows) + self.pos = new_pos + else: new_rows = [] try: @@ -1673,6 +1679,9 @@ def _get_lines(self, rows=None): raise Exception(msg) raise except StopIteration: + if self.skiprows: + new_rows = [row for i, row in enumerate(new_rows) + if self.pos + i not in self.skiprows] lines.extend(new_rows) if len(lines) == 0: raise diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index fb2b3fdd33bf1..16cc53976e862 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -735,6 +735,14 @@ def test_skiprows_bug(self): tm.assert_frame_equal(data, expected) tm.assert_frame_equal(data, data2) + def test_deep_skiprows(self): + # GH #4382 + text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in range(10)]) + condensed_text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]) + data = self.read_csv(StringIO(text), skiprows=[6, 8]) + condensed_data = self.read_csv(StringIO(condensed_text)) + tm.assert_frame_equal(data, condensed_data) + def test_detect_string_na(self): data = """A,B foo,bar